Precipitation values for every 5 minutes to hourly summaries in R - r

I am trying to get the total precipitation values for every hour from a personal weather station I have using the weatherData package. The problem I have is that the data is collected every five minutes and the values repeat themselves until there is a change in precipitation value. I have tried the 'duplicated' function but I get a large number of data removed when there is no precipitation which makes it hard for me to get a summary of the hourly precipitation.
Please see code below
## Load required libraries
library(weatherData)
library(ggplot2)
library(scales)
library(plyr)
library(reshape2)
library(gridExtra)
library(lubridate)
library(weathermetrics)
library(zoo)
# Get data for PWS using weatherData package
pws <- getWeatherForDate("IPENANGB2", "2014-09-01","2014-09-30", station_type = "id",opt_detailed=T, opt_custom_columns=T, custom_columns=c(1,2,6,7,10))
# Rename columns
colnames(pws)<-c("time","time1","tempc","wdd","wspd","prcp")
## Adding date columns
pws$time<-as.POSIXct(pws$time1,format="%Y-%m-%d %H:%M:%S",tz="Australia/Perth")
pws$year <- as.numeric(format(pws$time,"%Y"))
pws$date <-as.Date(pws$time,format="%Y-%m-%d",tz="Australia/Perth")
pws$year <- as.numeric(as.POSIXlt(pws$date)$year+1900)
pws$month <- as.numeric(as.POSIXlt(pws$date)$mon+1)
pws$monthf <- factor(pws$month,levels=as.character(1:12),labels=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"),ordered=TRUE)
pws$weekday <- as.POSIXlt(pws$date)$wday
pws$weekdayf <- factor(pws$weekday,levels=rev(0:6),labels=rev(c("Mon","Tue","Wed","Thu","Fri","Sat","Sun")),ordered=TRUE)
pws$yearmonth <- as.yearmon(pws$date)
pws$yearmonthf <- factor(pws$yearmonth)
pws$week <- as.numeric(format(as.Date(pws$date),"%W"))
pws$weekf<- factor(pws$week)
pws$jday<-yday(pws$date)
pws$hour <- as.numeric(format(strptime(pws$time, format = "%Y-%m-%d %H:%M"),format = "%H"))
pws$min <- as.numeric(format(strptime(pws$time, format = "%Y-%m-%d %H:%M"),format = "%M"))
# Remove duplicate values
pws.df <- pws[!duplicated(pws$prcp),]

Assuming you want to get hourly averages of tempc, wdd, wspd, prcp:
# used packages
library(weatherData)
library(lubridate)
library(dplyr)
library(stringr)
# read data
pws <- getWeatherForDate("IPENANGB2",
"2014-09-01",
"2014-09-30",
station_type = "id",
opt_detailed = T,
opt_custom_columns = T,
custom_columns = c(1, 2, 6, 7, 10))
# rename columns
colnames(pws) <- c("time", "time1", "tempc", "wdd", "wspd", "prcp")
# cleaning dataset and adding some columns
useful_pws <-
pws %>%
select(2:6) %>%
filter(!str_detect(time1, "<br>")) %>%
mutate(time1 = ymd_hms(time1),
year = year(time1),
month = month(time1),
day = day(time1),
hour = hour(time1)) %>%
tbl_df()
# summarising dataset
useful_pws %>%
select(-time1) %>%
group_by(year, month, day, hour) %>%
summarise(tempc = mean(tempc, na.rm = TRUE),
wdd = mean(wdd, na.rm = TRUE),
wspd = mean(wspd, na.rm = TRUE),
prcp = mean(prcp, na.rm = TRUE))

Related

Creating all possible variable combinations in R

I am having a daily dataset of 4 parameters which I have converted into monthly data using following code
library(zoo)
library(hydroTSM)
library(lubridate)
library(tidyverse)
set.seed(123)
df <- data.frame("date"= seq(from = as.Date("1983-1-1"), to = as.Date("2018-12-31"), by = "day"),
"Parameter1" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 15, 35),
"Parameter2" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 11, 29),
"Parameter3" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 50, 90),
"Parameter4" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 0, 27))
Monthly_data <- daily2monthly(df, FUN=mean, na.rm=TRUE)
After that, I have reshaped it to represent each column as month using following code
#Function to convert month abbreviation to a numeric month
mo2Num <- function(x) match(tolower(x), tolower(month.abb))
Monthly_data %>%
dplyr::as_tibble(rownames = "date") %>%
separate("date", c("Month", "Year"), sep = "-", convert = T) %>%
mutate(Month = mo2Num(Month))%>%
tidyr::pivot_longer(cols = -c(Month, Year)) %>%
pivot_wider(names_from = Month, values_from = value, names_prefix = "Mon",
names_sep = "_") %>%
arrange(name)
Now, I want to create parameter combinations like Parameter1 * Parameter2, Parameter1 * Parameter3, Parameter1 * Parameter4, Parameter2 * Parameter3, Parameter2 * Parameter4, Parameter3 * Parameter4 which will be added to the pivoted monthly data as rbind. The new dataframe Parameter1 * Parameter2 means to multiply their monthly values and then rbind to the above result. Likewise for all other above said combinations. How can I achieve this?
You can use this base R approach using combn assuming data is present for all the years for all parameters where df1 is the dataframe from the above output ending with arrange(name).
data <- combn(unique(df1$name), 2, function(x) {
t1 <- subset(df1, name == x[1])
t2 <- subset(df1, name == x[2])
t3 <- t1[-(1:2)] * t2[-(1:2)]
t3$name <- paste0(x, collapse = "_")
cbind(t3, t1[1])
}, simplify = FALSE)
You can then rbind it to original data.
new_data <- rbind(df1, do.call(rbind, data))

How to create a table with 1st row and 1st column as the header?

I want to create a data table in R with some data that I had already obtained. However, I'm not sure how to put those data into a table form because that required some skill to put he return data, monthlyRet, into the table according to their month respectively. The diagram attached below is the table format that I want, the data inside also need to be included.
Please note that the data for No.of.Positive and No.of.Negative are started from Aug instead of Jan due to the starting date in getSymbols. Hence, I wish the No.of.Positive and No.of.Negative can be arranged in the table created from Jan to Dec as shown in the diagram below.
The code below is how I obtained my data.
library(quantmod)
prices <-
getSymbols("^NDX", src = 'yahoo', from = "2009-07-01", to = "2019-08-01",
periodicity = "monthly", auto.assign = FALSE, warnings = FALSE)[,4]
return <- diff(log(prices))
r <- na.omit(exp(return)-1)
monthlyRet <- as.numeric(r)
meanMonthlyRet <- c()
No.of.Positive <- c()
No.of.Negative <- c()
for (j in 1:12){
Group <- c()
count_pos=0
count_neg=0
for (i in seq(j,length(monthlyRet),12)){
Group[i] <- monthlyRet[i]
if(monthlyRet[i]>0){
count_pos <- count_pos+1
}
else if(monthlyRet[i]<0){
count_neg <- count_neg+1
}
}
meanMonthlyRet[j] <- mean(Group, na.rm=TRUE)
Positive=0
Negative=0
if(meanMonthlyRet[j]>0){
Positive=count_pos
Negative=10-Positive
}
else if (meanMonthlyRet[j]<0){
Negative=count_neg
Positive=10-Negative
}
No.of.Positive[j] <- Positive
No.of.Negative[j] <- Negative
}
# My data required in table #--------------------------------------------------
Year <- c(2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019)
Month <- c("Aug","Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr","May","Jun","Jul")
r
No.of.Positive
No.of.Negative
I hope I can obtain exactly the same table format and content as the diagram below (I manually created in excel). Further, if the start and end date in getSymbols are changed, I hope the data in the table will still be correct.
Here is a tidyverse solution for your problem.
library(quantmod)
library(tidyverse)
prices <- getSymbols("^NDX", src = 'yahoo', from = "2009-07-01",
to = "2019-08-01", periodicity = "monthly",
auto.assign = FALSE, warnings = FALSE)[,4]
r <- prices %>%
log %>%
diff %>%
exp %>%
{. - 1}
table_out <- r %>%
as.data.frame() %>%
rownames_to_column() %>%
set_names(c("date", "variable")) %>%
mutate(variable = (variable * 100) %>% round(2)) %>%
separate(date, c("year", "month", "day")) %>%
select(-day) %>%
spread(month, variable)
n_pos <- map_dbl(table_out, ~sum(. > 0, na.rm = T))
n_neg <- map_dbl(table_out, ~sum(. < 0, na.rm = T))
table_out <- table_out %>%
mutate_if(is.double, ~str_c(., "%")) %>%
rbind(n_pos, n_neg)
x <- nrow(table_out)
table_out[(x-1):x, "year"] <- c("No. of Positive","No. of Negative")
table_out

R error "do not know how to convert 'x' to class “POSIXlt”"

I was trying to use R to map with some data saved as 'csv' format. Date in my data is in YYYY-dd-mm format.I want to add some data information,like 'year''month',I used the following code:
effort_df <- effort_df %>%
mutate(year = year(date),
month = month(date))
after running the code, I get the following error:
Error in as.POSIXlt.default(x, tz = tz(x)) :
do not know how to convert 'x' to class “POSIXlt”
And the complete codes are as followings:
#Load packages
library(tidyverse) # for general data wrangling and plotting
library(furrr) # for parallel operations on lists
library(lubridate) # for working with dates
library(sf) # for vector data
library(raster) # for working with rasters
library(maps) # additional helpful mapping packages
library(maptools)
library(rgeos)
# World polygons from the maps package
world_shp <- sf::st_as_sf(maps::map("world", plot = FALSE, fill = TRUE))
# Load EEZ polygons
eezs <- read_sf("F:/data/shapefiles/World_EEZ_v10_20180221", layer = 'eez_v10') %>%
filter(Pol_type == '200NM') # select the 200 nautical mile polygon layer
# Specify location of data directory containing daily csv files.
data_dir <- ("F:/yjs/data/fishing_effort/fishing_effort")
# Create dataframe of filenames dates and filter to date range of interest
effort_files <- tibble(
file = list.files(paste0(data_dir, 'fishing_effort_byflag'),
pattern ='.csv', recursive = T, full.names = T),
date =ymd(str_extract(file,
pattern = '[[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2}')))
# Generate a vector of dates of interest using ymd from lubridate
effort_dates <- seq(ymd('2016-01-01'), ymd('2016-12-31'), by='days')
# Filter to files within our date range of interest
effort_files <- filter(effort_files, date %in% effort_dates)
# Read in data (uncomment to read in parallel)
plan(multisession)
effort_df <- furrr::future_map_dfr(effort_files$file, .f = read_csv)
class(effort_df$date)
# Add date information
effort_df <- effort_df %>%
mutate(year = year(date),
month = month(date))
# Specify new (lower) resolution in degrees for aggregating data
res <- 0.25
# Transform data across all fleets and geartypes
effort_df <- effort_df %>%
mutate(
lat_bin = lat_bin / 100,
lon_bin = lon_bin / 100,
lat_bin = floor(lat_bin/res) * res + 0.5 * res,
lon_bin = floor(lon_bin/res) * res + 0.5 * res)
# Re-aggregate the data to 0.25 degrees
effort_df <- effort_df %>%
group_by(date, year, month, lon_bin, lat_bin, flag, geartype) %>%
summarize(vessel_hours = sum(vessel_hours, na.rm = T),
fishing_hours = sum(fishing_hours, na.rm = T),
mmsi_present = sum(mmsi_present, na.rm = T))
# Aggregate data across all fleets and geartypes
effort_all <- effort_df %>%
group_by(lon_bin,lat_bin) %>%
summarize(fishing_hours = sum(fishing_hours, na.rm = T),
log_fishing_hours = log10(sum(fishing_hours, na.rm = T))) %>%
ungroup() %>%
mutate(log_fishing_hours = ifelse(log_fishing_hours <= 1, 1, log_fishing_hours),
log_fishing_hours = ifelse(log_fishing_hours >= 5, 5, log_fishing_hours)) %>%
filter(fishing_hours >= 24)
# Linear green color palette function
effort_pal <- colorRampPalette(c('#0C276C', '#3B9088', '#EEFF00', '#ffffff'),
interpolate = 'linear')
# Map fishing effort
p1 <- effort_all %>%
ggplot() +
geom_sf(data = world_shp, fill = '#374a6d', color = '#0A1738',size = 0.1) +
geom_sf(data = eezs,color = '#374a6d',alpha = 0.2,fill = NA,size = 0.1) +
geom_raster(aes(x=lon_bin,y=lat_bin,fill=log_fishing_hours)) +
scale_fill_gradientn(
"Fishing Hours",
na.value = NA,limits <- c(1, 5),colours = effort_pal(5),
labels <- c("10","100","1,000","10,000","100,000+"),values = scales::rescale(c(0, 1))) +
labs(fill = "Fishing hours (log scale)",
title = "Global fishing effort in 2016") +
guides(fill = guide_colourbar(barwidth = 10)) +
gfw_theme
How can I solve this issue?
data_dir <- ("F:/yjs/data/fishing_effort/fishing_effort")
you lose / at the end of the path.
The correct one should be
data_dir <- ("F:/yjs/data/fishing_effort/fishing_effort/")

Converting a raw data frame into workable time series

I have a spreadsheet documenting prices of 40 similar products at various dates. It looks like this.
date_1<-seq(as.Date("2010-01-01"), as.Date("2011-01-01"), length.out = 40)
date_2<-seq(as.Date("2011-01-01"), as.Date("2012-01-01"), length.out = 40)
date_3<-seq(as.Date("2012-01-01"), as.Date("2013-01-01"), length.out = 40)
date_4<-seq(as.Date("2013-01-01"), as.Date("2014-01-01"), length.out = 40)
date_5<-seq(as.Date("2014-01-01"), as.Date("2015-01-01"), length.out = 40)
date_6<-seq(as.Date("2015-01-01"), as.Date("2016-01-01"), length.out = 40)
price_1<-floor(seq(20, 50, length.out = 40))
price_2<-floor(seq(20, 60, length.out = 40))
price_3<-floor(seq(20, 70, length.out = 40))
price_4<-floor(seq(30, 80, length.out = 40))
price_5<-floor(seq(40, 100, length.out = 40))
price_6<-floor(seq(50, 130, length.out = 40))
data.frame(date_1,price_1,date_2,price_2,date_3,price_3,date_4,price_4,date_5,price_5,date_6,price_6)
The problem is, the columns representing dates and prices alternate (convenient for record keeping). How can I transform the above data to a new dataframe consisting solely of prices of these 40 products as rows, with dates as column names? This will generate a lot of NA's because the dates in each column differ but that's OK.
When working with time series data it is often helpful to have it in long form (one row per observation), even if your target output is wide (one row per time series). Here are three possible approaches to get it into long form, then widen:
1. base reshape()
To get long form, base reshape is definitely a powerful option. The following solution improves on the accepted solution because it works for any numbers of products and observations and eliminates an unnecessary step:
df <- data.frame(date_1,price_1,date_2,price_2,date_3,price_3,
date_4,price_4,date_5,price_5,date_6,price_6)
# no need to create an id variable
long_form <- reshape(df, # idvar="id" by default
varying = list(grep('date_',names(df), value=TRUE),
grep('price_',names(df), value=TRUE) ),
v.names=c("date","price"),
direction="long",
sep="_")
And reshape can also widen it. (We'll use spread in another approach below.)
wide_form <- reshape(long_form, drop='time', timevar='date', direction='wide')
2. data.table melt() and dcast() (likely faster on real dataset)
Make sure you have data.table v1.9.6 or later, which allows you to melt multiple columns.
library(data.table)
setDT(df)
melt.data.table(df[, prod_id := .I], # product id = original row number
measure.vars = list(grep('date_',names(df), value=TRUE),
grep('price_',names(df), value=TRUE) ),
variable.name = 'sequence',
value.name = c('date','price'),
id.vars = 'prod_id') -> long_form
In this case you don't use the sequence, so to get wide form is just:
dcast.data.table(long_form[, !'sequence', with=FALSE],
value.var = 'price', # optional (function guesses correctly)
prod_id ~ date) -> wide_form
3. tidyr & dplyr split-apply-combine (easy to understand)
It doesn't require the mental gymnastics that reshape does (at least for me). It is a column-wise variant on the "split-apply-combine" paradigm.
library(dplyr); library(tidyr)
# Create long-form time series data
# Split table into sequenced prices and dates, then combine on product and sequence
full_join(
df %>%
select(starts_with('date_')) %>% #~~~~ Left side = date component ~~~~~~~~
mutate(prod_id = 1:nrow(df)) %>% #~ product id = original row number ~
gather(sequence, date, -prod_id) %>% #~ long form = 1 row per prod per seq ~
mutate(sequence = #~~~ Cols: product_id, sequence, date ~~~
sub('^date_(\\d+)$', '\\1', sequence) ) ,
df %>%
select(starts_with('price_')) %>% #~~~ Right side = price component ~~~~~~~
mutate(prod_id = 1:nrow(df)) %>% #~ ~
gather(sequence, price, -prod_id) %>% #~ same idea ~
mutate(sequence = #~~ Cols: product_id, sequence, price ~~~
sub('^price_(\\d+)$', '\\1', sequence) )
) -> long_form
In this case you don't need the sequence, so to get to wide form it's simply:
long_form %>% select(-sequence) %>% spread(date, price) -> wide_form
as noted by others above.
Here is one way I came up with using dplyr/tidyr packages:
library(tidyr)
library(dplyr)
date_1<-seq(as.Date("2010-01-01"), as.Date("2011-01-01"), length.out = 40)
date_2<-seq(as.Date("2011-01-01"), as.Date("2012-01-01"), length.out = 40)
date_3<-seq(as.Date("2012-01-01"), as.Date("2013-01-01"), length.out = 40)
date_4<-seq(as.Date("2013-01-01"), as.Date("2014-01-01"), length.out = 40)
date_5<-seq(as.Date("2014-01-01"), as.Date("2015-01-01"), length.out = 40)
date_6<-seq(as.Date("2015-01-01"), as.Date("2016-01-01"), length.out = 40)
price_1<-floor(seq(20, 50, length.out = 40))
price_2<-floor(seq(20, 60, length.out = 40))
price_3<-floor(seq(20, 70, length.out = 40))
price_4<-floor(seq(30, 80, length.out = 40))
price_5<-floor(seq(40, 100, length.out = 40))
price_6<-floor(seq(50, 130, length.out = 40))
df <- data.frame(date_1,price_1,date_2,price_2,date_3,price_3,date_4,price_4,date_5,price_5,date_6,price_6)
dates <- df[, grep('date', names(df))]
dates <- dates %>% gather(date_type, date) %>% select(-date_type)
prices <- df[, grep('price', names(df))]
prices <- prices %>% gather(price_type, price) %>% select(-price_type)
df <- cbind(dates, prices)
Then, to spread dates to columns and prices to rows, you can do something like this:
df <- arrange(df, price)
df <- spread(df, date, price)
Using baseR and tidyr you could do:
library(tidyr)
#add an id to identify the products
df$id=1:40
#transform the data to a long format
long_data <- reshape(df,idvar="id",varying=list(paste0("date_",1:6),paste0("price_",1:6)),v.names=c("date","price"),direction="long",sep="_")
long_data <- long_data[,!grepl("time",colnames(long_data))]
#put it back to a wide format
wide_data <- spread(long_data,date,price)

hourly sums with dplyr with zeros for empty hours

I have a dataset similar to the format of "my_data" below, where each line is a single count of an event. I want to obtain a summary of how many events happen in every hour. I would like to have every hour with no events be included with a 0 for its "hourly_total" value.
I can achieve this with dplyr as shown, but the empty hours are dropped instead of being set to 0.
Thank you!
set.seed(123)
library(dplyr)
library(lubridate)
latemail <- function(N, st="2012/01/01", et="2012/1/31") {
st <- as.POSIXct(as.Date(st))
et <- as.POSIXct(as.Date(et))
dt <- as.numeric(difftime(et,st,unit="sec"))
ev <- sort(runif(N, 0, dt))
rt <- st + ev
}
my_data <- data_frame( fake_times = latemail(25),
count = 1)
my_data %>% group_by( rounded_hour = floor_date(fake_times, unit = "hour")) %>%
summarise( hourly_total = sum(count))
Assign your counts to an object
counts <- my_data %>% group_by( rounded_hour = floor_date(fake_times, unit = "hour")) %>%
summarise( hourly_total = sum(count))
Create a data frame with all the necessary hours
complete_data = data.frame(hour = seq(floor_date(min(my_data$fake_times), unit = "hour"),
floor_date(max(my_data$fake_times), unit = "hour"),
by = "hour"))
Join to it and fill in the NAs.
complete_data %>% group_by( rounded_hour = floor_date(hour, unit = "hour")) %>%
left_join(counts) %>%
mutate(hourly_total = ifelse(is.na(hourly_total), 0, hourly_total))

Resources