Combine outputs of a function for each index in a for loop in R - r

I have created a function which completes a calculation based on data from two data frames for an individual.
I want to complete that function for each individual and combine all the outputs in a data frame and export to .csv
Currently the output .csv only has data for person 34, none of the other.
I've noted that when I run this it creates an object ID, which is just the numeric 34. It seems to be only holding on to the last ID in data$ID.
How can I create an output with results for all persons?
library(dplyr)
library(lubridate)
library(date)
screen_function = function(x){
# Select each person and get necessary inputs
data = data%>%filter(ID == x)
demogs = demogs %>% filter (P_ID == x)
data$Age = demogs$Age
data$result = data$test * data$Age
data$Date = as.Date(data$Date,format='%d/%m/%Y') # ensures date column is in correct format
# only include tests from most recent 12-24 months and only proceed if test in last 12 months
Recent_12m = data %>% filter(between(Date, today() - years(1), today()))
Recent_24m = data %>% filter(between(Date, today() - years(2), today()))
if ((nrow(Recent_12m)) > 0) {
data = rbind(Recent_12m,Recent_24m)
Recent_12m$min_date = min(Recent_12m$Date)
Recent_12m$Date = as.Date(Recent_12m$Date,format='%d/%m/%Y')
Recent_12m$min_date = as.Date(Recent_12m$min_date,format='%d/%m/%Y')
Recent_24m$min_date = min(Recent_24m$Date)
Recent_24m$Date = as.Date(Recent_24m$Date,format='%d/%m/%Y')
Recent_24m$min_date = as.Date(Recent_24m$min_date,format='%d/%m/%Y')
Recent_12m$Period = interval(Recent_12m$min_date, Recent_12m$Date)
Recent_12m$Years = as.numeric(Recent_12m$Period, unit = "years")
Recent_24m$Period = interval(Recent_24m$min_date, Recent_24m$Date)
Recent_24m$Years = as.numeric(Recent_24m$Period, unit = "years")
# Latest result
Last = filter(Recent_12m, (Recent_12m$Date == max(Date)))
# linear regression model
Reg_12month <- lm(result ~ Years, data=Recent_12m)
Reg_24month <- lm(result ~ Years, data=Recent_24m)
info = c(x, round(Last$result, digits=1), round(Reg_12month$coefficients["Years"], digits = 1), round(Reg_24month$coefficients["Years"], digits = 1))
newdf = data.frame(matrix(0, ncol = 4))
colnames(newdf) = c("ID", "Latest result", "Trend 12month", "Trend 24 month")
newdf= rbind(newdf, info)
write.csv(newdf, "filepath.csv")
}
}
Date= sample(seq(as.Date('2019/11/01'), as.Date('2020/11/01'), by="day"), 12)
ID= c(12,12,12,450,450,450,1,1,1,34,34,34)
test= rnorm(12, mean=150, sd=60)
data= data.frame(ID, Date, test)
P_ID = c(1,12,34,450)
Age = c(50,45,60,72)
demogs = data.frame(P_ID, Age)
persons = unique(data$ID)
for(ID in persons){
screen_function(paste("", ID,"", sep=""))
}
Created on 2020-11-16 by the reprex package (v0.3.0)

So, I've got around this by using a pre-made .csv, instead of creating a new dataframe. The .csv just contains a single row of 4 columns, with random entries in each cell.
newdf= read.csv(file = "filepath.csv")
info = c(x, round(Last$result, digits=1), round(Reg_12month$coefficients["Years"], digits = 1),
round(Reg_24month$coefficients["Years"], digits = 1))
newdf= rbind(Summary, patient_info)
colnames(newdf) = c("ID", "Latest result", "Trend 12month", "Trend 24 month")
newdf= distinct(newdf, ID, .keep_all = TRUE)
write.csv(Summary, "filepath.csv", row.names = FALSE)}}

Related

Creating all possible variable combinations in R

I am having a daily dataset of 4 parameters which I have converted into monthly data using following code
library(zoo)
library(hydroTSM)
library(lubridate)
library(tidyverse)
set.seed(123)
df <- data.frame("date"= seq(from = as.Date("1983-1-1"), to = as.Date("2018-12-31"), by = "day"),
"Parameter1" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 15, 35),
"Parameter2" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 11, 29),
"Parameter3" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 50, 90),
"Parameter4" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 0, 27))
Monthly_data <- daily2monthly(df, FUN=mean, na.rm=TRUE)
After that, I have reshaped it to represent each column as month using following code
#Function to convert month abbreviation to a numeric month
mo2Num <- function(x) match(tolower(x), tolower(month.abb))
Monthly_data %>%
dplyr::as_tibble(rownames = "date") %>%
separate("date", c("Month", "Year"), sep = "-", convert = T) %>%
mutate(Month = mo2Num(Month))%>%
tidyr::pivot_longer(cols = -c(Month, Year)) %>%
pivot_wider(names_from = Month, values_from = value, names_prefix = "Mon",
names_sep = "_") %>%
arrange(name)
Now, I want to create parameter combinations like Parameter1 * Parameter2, Parameter1 * Parameter3, Parameter1 * Parameter4, Parameter2 * Parameter3, Parameter2 * Parameter4, Parameter3 * Parameter4 which will be added to the pivoted monthly data as rbind. The new dataframe Parameter1 * Parameter2 means to multiply their monthly values and then rbind to the above result. Likewise for all other above said combinations. How can I achieve this?
You can use this base R approach using combn assuming data is present for all the years for all parameters where df1 is the dataframe from the above output ending with arrange(name).
data <- combn(unique(df1$name), 2, function(x) {
t1 <- subset(df1, name == x[1])
t2 <- subset(df1, name == x[2])
t3 <- t1[-(1:2)] * t2[-(1:2)]
t3$name <- paste0(x, collapse = "_")
cbind(t3, t1[1])
}, simplify = FALSE)
You can then rbind it to original data.
new_data <- rbind(df1, do.call(rbind, data))

How to create a table with 1st row and 1st column as the header?

I want to create a data table in R with some data that I had already obtained. However, I'm not sure how to put those data into a table form because that required some skill to put he return data, monthlyRet, into the table according to their month respectively. The diagram attached below is the table format that I want, the data inside also need to be included.
Please note that the data for No.of.Positive and No.of.Negative are started from Aug instead of Jan due to the starting date in getSymbols. Hence, I wish the No.of.Positive and No.of.Negative can be arranged in the table created from Jan to Dec as shown in the diagram below.
The code below is how I obtained my data.
library(quantmod)
prices <-
getSymbols("^NDX", src = 'yahoo', from = "2009-07-01", to = "2019-08-01",
periodicity = "monthly", auto.assign = FALSE, warnings = FALSE)[,4]
return <- diff(log(prices))
r <- na.omit(exp(return)-1)
monthlyRet <- as.numeric(r)
meanMonthlyRet <- c()
No.of.Positive <- c()
No.of.Negative <- c()
for (j in 1:12){
Group <- c()
count_pos=0
count_neg=0
for (i in seq(j,length(monthlyRet),12)){
Group[i] <- monthlyRet[i]
if(monthlyRet[i]>0){
count_pos <- count_pos+1
}
else if(monthlyRet[i]<0){
count_neg <- count_neg+1
}
}
meanMonthlyRet[j] <- mean(Group, na.rm=TRUE)
Positive=0
Negative=0
if(meanMonthlyRet[j]>0){
Positive=count_pos
Negative=10-Positive
}
else if (meanMonthlyRet[j]<0){
Negative=count_neg
Positive=10-Negative
}
No.of.Positive[j] <- Positive
No.of.Negative[j] <- Negative
}
# My data required in table #--------------------------------------------------
Year <- c(2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019)
Month <- c("Aug","Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr","May","Jun","Jul")
r
No.of.Positive
No.of.Negative
I hope I can obtain exactly the same table format and content as the diagram below (I manually created in excel). Further, if the start and end date in getSymbols are changed, I hope the data in the table will still be correct.
Here is a tidyverse solution for your problem.
library(quantmod)
library(tidyverse)
prices <- getSymbols("^NDX", src = 'yahoo', from = "2009-07-01",
to = "2019-08-01", periodicity = "monthly",
auto.assign = FALSE, warnings = FALSE)[,4]
r <- prices %>%
log %>%
diff %>%
exp %>%
{. - 1}
table_out <- r %>%
as.data.frame() %>%
rownames_to_column() %>%
set_names(c("date", "variable")) %>%
mutate(variable = (variable * 100) %>% round(2)) %>%
separate(date, c("year", "month", "day")) %>%
select(-day) %>%
spread(month, variable)
n_pos <- map_dbl(table_out, ~sum(. > 0, na.rm = T))
n_neg <- map_dbl(table_out, ~sum(. < 0, na.rm = T))
table_out <- table_out %>%
mutate_if(is.double, ~str_c(., "%")) %>%
rbind(n_pos, n_neg)
x <- nrow(table_out)
table_out[(x-1):x, "year"] <- c("No. of Positive","No. of Negative")
table_out

R script works on one data file, but not on another one of the same type

https://www.dropbox.com/s/yjvesdycs6n91ee/pecfas_100233%20%5Bmi-%20washtenaw%20community%20health%20orgnization%5D.csv?dl=0
https://www.dropbox.com/s/v41dpf64rhysqum/pecfas_100203%20%5Bmi-%20southwest%20counseling%20solutions%5D.csv?dl=0
Above are links to two data files. Below is my R script. Briefly, the R script opens these files, deletes several unnecessary variables, creates other variables, and generally makes the dataset closer to something that I can work with. My problem is that while it is working on the Washtenaw file and 14 other similar files, it is NOT working with the Southwest Counseling file.
The problem arises towards the end of the code with the variables rec.cols, rec.colnames, and fas.
When I try to use rec.colnames as names, I get the error that the names attribute contains more items than the vector.
I understand the concept of this issue: rec.colnames has roughly 70 items, and rec.cols is supposedly 0 columns and 1 row. However, this should not be the case. I do not have this problem with any other file. I would like someone to help me figure out why this is happening, and also what I can do to fix it.
library(data.table)
library(magrittr)
library(dplyr)
library(stringr)
library(tidyr)
# even after the item-level data is deleted from each data set, when all of the data sets are merged, the merged file is still too large to manipulate efficiently
# however, only the data from the initial and last assessments within the year of interest are needed for the reports
# the function below only deletes assessment data that are not from the initial or last assessments
del.mid <- function(file){
lower <- str_match(pattern = "cafas|pecfas", string = file)
upper <- toupper(lower)
fas <- fread(input = paste0("data/", lower, "/items-del/", file), sep = ",", header = TRUE, na.strings = "", stringsAsFactors = FALSE, colClasses = "character", fill = TRUE, data.table = FALSE)
#
# some cases have assessments entered that do not have any scores
# remove all data for these assessments so that they are not used as the initial most recent assessment
#
fas.long <- fas %>%
gather(key = key, value = value, starts_with(paste0(upper, "_E")), na.rm = FALSE) %>%
separate(col = key, into = c("scale", "time", "var"), sep = "_", extra = "merge") %>%
spread(key = var, value = value)
other.vars <- c("episodeNum", "clientAge", "assessID", "VersionNum", "SA_SoftwareID", "SA_ID", "Prog_SoftwareID", "Prog_ID", "assessDate", "daysSinceEpisodeStart", "status", "isLocked", "isDeleted", "isAmendment", "adminDesc", "adminMonths", "adminOtherDesc", "timePeriod", "noSubsequent", "language", "youthLivArrange", "youthLivArrangeIsOther", "youthOtherLivArrange", "assessorSoftwareID", "Primary_CaregiverID", "Primary_CaregiverRelation")
# score variables for the CAFAS and PECFAS are different
if(lower == "cafas") score.vars <- c("TotalScore", "TotalStrengths", "Tier", "ChildMgmtSkills", "PervasBhImpair", "RiskPsychotic", "RiskSevereSubstance", "RiskSuicideAttempt", "RiskSuicideIdeation", "RiskAggresive", "RiskSexual", "RiskFire", "RiskRunaway", "RiskExceedPrimaryCG", "RiskExceedNonCust", "RiskExceedSurrogate", "TotalSevere", "TotalScoreDiff", "MeaningfulReliableDiff", "SevereDiff", "PervasiveImpairDiff", "IndicatorImprove", "SchoolScore", "SchoolStrengthTotal", "HomeScore", "HomeStrengthTotal", "CommunityScore", "CommunityStrengthTotal", "BehaviorScore", "BehaviorStrengthTotal", "MoodScore", "MoodStrengthTotal", "SelfHarmScore", "SelfHarmStrengthTotal", "SubstanceScore", "SubstanceStrengthTotal", "ThinkingScore", "ThinkingStrengthTotal", "CG_P_Needs", "CG_P_Support", "Primary_CG_StrengthTotal", "NonCust_CaregiverID", "NonCust_CaregiverRelation", "NC_CG_Needs", "NC_CG_Support", "NC_CG_StrengthTotal", "Surr_CaregiverID", "Surr_CaregiverRelation", "Surr_CG_Needs", "Surr_CG_Support", "Surr_CG_StrengthTotal")
if(lower == "pecfas") score.vars <- c("TotalScore", "TotalStrengths", "Tier", "ChildMgmtSkills", "PervasBhImpair", "RiskThinking", "RiskSuicideAttempt", "RiskSuicideIdeation", "RiskAggresive", "RiskSexual", "RiskFire", "RiskRunaway", "RiskDelinquent", "RiskExceedPrimaryCG", "RiskExceedNonCust", "RiskExceedSurrogate", "TotalSevere", "TotalScoreDiff", "MeaningfulReliableDiff", "SevereDiff", "PervasiveImpairDiff", "IndicatorImprove", "SchoolScore", "SchoolStrengthTotal", "HomeScore", "HomeStrengthTotal", "CommunityScore", "CommunityStrengthTotal", "BehaviorScore", "BehaviorStrengthTotal", "MoodScore", "MoodStrengthTotal", "SelfHarmScore", "SelfHarmStrengthTotal", "ThinkingScore", "ThinkingStrengthTotal", "CG_P_Needs", "CG_P_Support", "Primary_CG_StrengthTotal", "NonCust_CaregiverID", "NonCust_CaregiverRelation", "NC_CG_Needs", "NC_CG_Support", "NC_CG_StrengthTotal", "Surr_CaregiverID", "Surr_CaregiverRelation", "Surr_CG_Needs", "Surr_CG_Support", "Surr_CG_StrengthTotal")
fas.long[fas.long[, score.vars] %>% is.na %>% rowSums == length(score.vars), other.vars] <- NA
fas.wide <- fas.long %>%
unite(col = scaletime, c(scale, time), sep = "_") %>%
gather(key = var, value = value, adminDesc:youthOtherLivArrange) %>%
unite(col = key, c(scaletime, var), sep = "_") %>%
spread(key = key, value = value, fill = NA)
fas <- fas.wide[, names(fas)]
rm(fas.long, fas.wide, other.vars)
# only interested in cases with assessments within the year of interest for report
# year of interest for report: 10/1/2013 to 9/30/2016
start.date <- as.Date("2009-10-1", format = "%Y-%m-%d")
end.date <- as.Date("2016-9-30", format = "%Y-%m-%d")
# convert date variables to date type
fas[, grep(pattern = "Date", x = names(fas))] <- lapply(X = fas[, grep(pattern = "Date", x = names(fas))], FUN = as.Date, format = "%m/%d/%Y")
#
# for cases with revised initial assessments, replace initial data with revised initial data
#
# create variables that indicate whether a revised assessment is present or absent
# some cases have multiple initial assessments, so check all of them for revisions
# variable "max.e" reflects the maximum number of initial assessments present in the data sets
seq.e <- grep(pattern = "E[1-9]+TInitial_assessDate", x = names(fas), value = TRUE) %>% regmatches(., gregexpr("[0-9]+", .)) %>% unlist
max.e <- seq.e %>% max %>% as.numeric
fas[, sapply(X = seq.e, FUN = function(x) paste0("e", x, "tr.initial"))] <- FALSE
# function to replace initial assessment data with revised data
for(num in seq.e){
datecol <- paste0(upper, "_E", num, "TRInitial_assessDate")
logcol <- paste0("e", num, "tr.initial")
if(length(grep(pattern = paste0("E", num, "TRInitial"), names(fas))) > 0) fas[(fas[ , datecol] >= start.date) & (fas[, datecol] <= end.date) & (!is.na(fas[ , datecol])), logcol] <- TRUE
fas[fas[, logcol] == TRUE, grep(pattern = paste0("E", num, "TInitial"), x = names(fas))] <- fas[fas[, logcol] == TRUE, grep(pattern = paste0("E", num, "TRInitial"), x = names(fas))]
}
rm(num, datecol, logcol)
# delete revised assessment data columns because they are redundant
# prevents them from being considered as a recent assessment
if(length(grep(pattern = paste0("E[1-", max.e, "]TRInitial"), names(fas))) > 0)
fas <- fas[, -grep(pattern = paste0("E[1-", max.e, "]TRInitial"), names(fas))]
#
# create variables to indicate the presence or absence of an initial assessment
#
fas[, sapply(X = seq.e, FUN = function(x) paste0("e", x, "t.initial"))] <- FALSE
for(num in seq.e){
datecol <- paste0(upper, "_E", num, "TInitial_assessDate")
fas[fas[, datecol] >= start.date & fas[, datecol] <= end.date & !is.na(fas[, datecol]), paste0("e", num, "t.initial")] <- TRUE
fas[rowSums(is.na(fas[paste0(upper, "_E", num, "TInitial_", score.vars)])) == length(score.vars), paste0("e", num, "t.initial")] <- FALSE # do not consider an assessment as present if all the scores are missing
}
rm(num, datecol)
# some cases have multiple initial assessments within the year of interest
# use the earliest one
# create a variable that indicates which initial assessment is used
fas$et.initial <- 0
for(num in rev(seq.e)){
fas$et.initial[fas[, paste0("e", num, "t.initial")] == TRUE] <- as.numeric(num)
}
rm(num)
#
# create a set of initial assessment variables that applies to all children
#
# delete cases without an initial assessment
fas <- fas[fas$et.initial != 0, ]
init.cols <- fas %>% select(matches(paste0(upper, "_E1TInitial")))
init.colnames <- grep(pattern = paste0(upper, "_E1TInitial"), x = names(fas), value = TRUE)
init.colnames <- paste0(upper, "_", gsub(pattern = paste0(upper, "_E1T"), replacement = "", x = init.colnames))
names(init.cols) <- init.colnames
fas <- cbind(fas, init.cols)
rm(init.cols, init.colnames)
for(num in seq.e[2:length(seq.e)]){
fas[fas$et.initial == as.numeric(num), grep(pattern = paste0(upper, "_Initial"), x = names(fas))] <- fas[fas$et.initial == as.numeric(num), grep(pattern = paste0(upper, "_E", num, "TInitial"), x = names(fas))]
}
#
# create a set of most recent assessment variables that applies to all children
#
# create variables that indicate number of days that have passed since the initial assessment for each assessment period
days <- lapply(X = fas[, grep(pattern = "assessDate", names(fas))], FUN = function(x) x - fas[, paste0(upper, "_Initial_assessDate")]) %>% unlist %>% matrix(nrow = nrow(fas), byrow = FALSE) %>% data.frame
names(days) <- grep(pattern = "assessDate", x = names(fas), value = TRUE) %>% sub(pattern = paste0(upper, "_"), replacement = "", x = .) %>% sub(pattern = "_assessDate", replacement = "", x = .) %>% paste(., "_days_pass", sep = "")
fas <- cbind(fas, days)
rm(days)
fas$Initial_days_pass <- NULL
# replace 0 or negative days passed with NA
fas[, grep(pattern = "days_pass", x = names(fas))][fas[, grep(pattern = "days_pass", names(fas))] <= 0] <- NA
# the code following the chunk below creates variables for the most recent assessment data
# it is easier to execute the code if cases with only an initial assessment are not in the data set
# add these cases back to the data set later
# some data sets do not have any cases with a most recent assessment
if((rowSums(is.na(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])) == ncol(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])) %>% sum != nrow(fas)) {
init.only <- fas[which(rowSums(is.na(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])) == ncol(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])), ]
fas <- fas[-which(rowSums(is.na(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])) == ncol(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])), ]
# calculate the max days between the initial and most recent assessments
fas$rec_days <- apply(X = fas[grep(pattern = "days_pass", x = names(fas), value = TRUE)], MARGIN = 1, FUN = max, na.rm = TRUE)
# create a variable that indicates the time period of the most recent assessment
days.pass.cols <- fas[grep(pattern = "days_pass", x = names(fas), value = TRUE)]
fas$rec_time <- colnames(days.pass.cols)[apply(X = days.pass.cols, MARGIN = 1, FUN = which.max)] %>% sub(pattern = "_days_pass", replacement = "", x = .)
rm(days.pass.cols)
# create most recent assessment variables
rec.cols <- apply(X = fas, MARGIN = 1, FUN = function(x) x[grep(pattern = paste0(upper, "_Recent_"), names(x))] <- x[grep(pattern = paste(paste0(upper, "_"), x["rec_time"], sep = ""), names(x))]) %>% t %>% data.frame
rec.colnames <- grep(pattern = paste0(upper, "_Initial"), x = names(fas), value = TRUE) %>% gsub(pattern = "Initial", replacement = "Recent", x = .)
names(rec.cols) <- rec.colnames
fas <- cbind(fas, rec.cols)
rm(rec.cols)
}
#
# limit data set to background variables, initial, and most recent assessments
# add cases only with initial assessment back to data set
#
fas <- fas[, c(1:which(colnames(fas) == "E1_endDate"), which(colnames(fas) == "e1tr.initial"):ncol(fas))]
# some data sets do not have any cases with a most recent assessment
if((rowSums(is.na(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])) == ncol(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])) %>% sum != nrow(fas)) {
init.only <- init.only[, c(1:which(colnames(init.only) == "E1_endDate"), which(colnames(init.only) == "e1tr.initial"):ncol(init.only))]
fas <- bind_rows(fas, init.only)
rm(init.only)
}
# add filename to data sets
fas$filename <- file
write.csv(fas, paste0("data/", lower, "/bref/", gsub(pattern = "\\.", replacement = "_bref.", file)), row.names = FALSE)
}
# delete assessment data that are not from the initial or last assessments from all cafas data sets
cafas.files <- list.files("data/cafas/items-del", pattern = ".csv")
for (file in cafas.files){
del.mid(file)
}
# delete assessment data that are not from the initial or last assessments from all pecfas data sets
pecfas.files <- list.files("data/pecfas/items-del", pattern = ".csv")
for (file in pecfas.files){
del.mid(file)
}

How to run a changepoint analysis on multiple time series with dplyr

I'm collecting time series data from Wikipedia and want to run a change-point analysis on each time series using dplyr. But when I do so I get an error saying the data need to be numeric, even though the class function states it is numeric. Hope you can help.
library(changepoint)
library(dplyr)
library(pageviews)
library(data.table)
articles <- c("Rugby_union", "Football")
foo <- function(x){article_pageviews(project = "en.wikipedia",
article = x,
start = as.Date('2017-01-01'),
end = as.Date("2017-12-31")
, user_type = "user", platform = c("mobile-web"))
}
output<-articles %>% foo
output %>%
select(article, views) %>%
do(cpt.mean(.))
class(output$views)
library(changepoint)
library(dplyr)
library(pageviews)
articles <- c("Rugby_union", "Football")
foo <- function(x){article_pageviews(project = "en.wikipedia", article = x,
start = as.Date('2017-01-01'),
end = as.Date("2017-12-31"),
user_type = "user", platform = c("mobile-web"))
}
output <- articles %>%
foo
df <- as.data.frame(table(output$article))
output1 <- output %>%
dplyr::select(article, views) %>%
dplyr::filter(article == df[1,1])
output2 <- output %>%
dplyr::select(article, views) %>%
dplyr::filter(article == df[2,1])
q <- floor((min(length(output1$views), length(output2$views)))/2 + 1)
cp1 <- changepoint::cpt.mean(data = output1$views, Q = q, method = "BinSeg", penalty
= "SIC")
plot(cp1)
cp2 <- changepoint::cpt.mean(data = output2$views, Q = q, method = "BinSeg", penalty
= "SIC")
plot(cp2)

Precipitation values for every 5 minutes to hourly summaries in R

I am trying to get the total precipitation values for every hour from a personal weather station I have using the weatherData package. The problem I have is that the data is collected every five minutes and the values repeat themselves until there is a change in precipitation value. I have tried the 'duplicated' function but I get a large number of data removed when there is no precipitation which makes it hard for me to get a summary of the hourly precipitation.
Please see code below
## Load required libraries
library(weatherData)
library(ggplot2)
library(scales)
library(plyr)
library(reshape2)
library(gridExtra)
library(lubridate)
library(weathermetrics)
library(zoo)
# Get data for PWS using weatherData package
pws <- getWeatherForDate("IPENANGB2", "2014-09-01","2014-09-30", station_type = "id",opt_detailed=T, opt_custom_columns=T, custom_columns=c(1,2,6,7,10))
# Rename columns
colnames(pws)<-c("time","time1","tempc","wdd","wspd","prcp")
## Adding date columns
pws$time<-as.POSIXct(pws$time1,format="%Y-%m-%d %H:%M:%S",tz="Australia/Perth")
pws$year <- as.numeric(format(pws$time,"%Y"))
pws$date <-as.Date(pws$time,format="%Y-%m-%d",tz="Australia/Perth")
pws$year <- as.numeric(as.POSIXlt(pws$date)$year+1900)
pws$month <- as.numeric(as.POSIXlt(pws$date)$mon+1)
pws$monthf <- factor(pws$month,levels=as.character(1:12),labels=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"),ordered=TRUE)
pws$weekday <- as.POSIXlt(pws$date)$wday
pws$weekdayf <- factor(pws$weekday,levels=rev(0:6),labels=rev(c("Mon","Tue","Wed","Thu","Fri","Sat","Sun")),ordered=TRUE)
pws$yearmonth <- as.yearmon(pws$date)
pws$yearmonthf <- factor(pws$yearmonth)
pws$week <- as.numeric(format(as.Date(pws$date),"%W"))
pws$weekf<- factor(pws$week)
pws$jday<-yday(pws$date)
pws$hour <- as.numeric(format(strptime(pws$time, format = "%Y-%m-%d %H:%M"),format = "%H"))
pws$min <- as.numeric(format(strptime(pws$time, format = "%Y-%m-%d %H:%M"),format = "%M"))
# Remove duplicate values
pws.df <- pws[!duplicated(pws$prcp),]
Assuming you want to get hourly averages of tempc, wdd, wspd, prcp:
# used packages
library(weatherData)
library(lubridate)
library(dplyr)
library(stringr)
# read data
pws <- getWeatherForDate("IPENANGB2",
"2014-09-01",
"2014-09-30",
station_type = "id",
opt_detailed = T,
opt_custom_columns = T,
custom_columns = c(1, 2, 6, 7, 10))
# rename columns
colnames(pws) <- c("time", "time1", "tempc", "wdd", "wspd", "prcp")
# cleaning dataset and adding some columns
useful_pws <-
pws %>%
select(2:6) %>%
filter(!str_detect(time1, "<br>")) %>%
mutate(time1 = ymd_hms(time1),
year = year(time1),
month = month(time1),
day = day(time1),
hour = hour(time1)) %>%
tbl_df()
# summarising dataset
useful_pws %>%
select(-time1) %>%
group_by(year, month, day, hour) %>%
summarise(tempc = mean(tempc, na.rm = TRUE),
wdd = mean(wdd, na.rm = TRUE),
wspd = mean(wspd, na.rm = TRUE),
prcp = mean(prcp, na.rm = TRUE))

Resources