How to sum variable by month/year in R? - r

library(dplyr)
library(plotly)
library(lubridate)
googlesearch <- read.csv("multiTimeline.csv", header = FALSE, stringsAsFactors = FALSE)
googlesearch2 <- googlesearch [-1, ]
googlesearch2 <- googlesearch2 [-1, ]
colnames(googlesearch2)[1] <- 'Date'
colnames(googlesearch2)[2] <- 'NumberofSearch'
googlesearch2$Date <- as.Date(googlesearch2$Date)
googlesearch2 <- googlesearch2 %>%
filter(Date > "2015-01-04" & Date < "2018-05-27")
googlesearch3 <- googlesearch2 %>%
transform(googlesearch2$Date, Date = as.Date(as.character(Date), "%Y-%m-%d"))
googlesearch3 <- googlesearch2 %>%
mutate(month = format(Date, "%m"), year = format(Date, "%Y")) %>%
group_by(Date, yearMon = as.yearmon(Date, "%m-%d-%Y"))
googlesearch3$Date <- as.numeric(googlesearch3$NumberofSearch)
googlesearch3 <- googlesearch3 %>%
mutate(month = format(Date, "%m"), year = format(Date, "%Y")) %>%
group_by(Date, yearMon = as.yearmon(Date, "%m-%d-%Y")) %>%
summarise(NumberofSearch_sum = sum(NumberofSearch))
data <- tbl_df(googlesearch3)
data %>%
group_by(yearMon) %>%
summarise(NumberofSearch_mon = sum(NumberofSearch))
I know this is messy.
I'm getting this error and I don't know why.Adding the sample code.
Error in summarise_impl(.data, dots) :
Evaluation error: invalid 'type' (character) of argument.

In lack of a reproducible example, try to replace the last code chunk of you sample code with:
library(hablar)
data %>%
retype() %>%
group_by(yearMon) %>%
summarise(NumberofSearch_mon = sum(NumberofSearch))
Maybe it works :)

Related

How to fill in missing value of a data.frame in R?

I have multiple columns that has missing values. I want to use the mean of the same day across all years while filling the missing data for each column. for example, DF is my fake data where I see missing values for the two columns (A & X)
library(lubridate)
library(tidyverse)
library(naniar)
set.seed(123)
DF <- data.frame(Date = seq(as.Date("1985-01-01"), to = as.Date("1987-12-31"), by = "day"),
A = sample(1:10,1095, replace = T), X = sample(5:15,1095, replace = T)) %>%
replace_with_na(replace = list(A = 2, X = 5))
To fill in Column A, i use the following code
Fill_DF_A <- DF %>%
mutate(Year = year(Date), Month = month(Date), Day = day(Date)) %>%
group_by(Year, Day) %>%
mutate(A = ifelse(is.na(A), mean(A, na.rm=TRUE), A))
I have many columns in my data.frame and I would like to generalize this for all the columns to fill in the missing value?
We can use na.aggregate from zoo
library(dplyr)
library(zoo)
DF %>%
mutate(Year = year(Date), Month = month(Date), Day = day(Date)) %>%
group_by(Year, Day) %>%
mutate(across(A:X, na.aggregate))
Or if we prefer to use conditional statements
DF %>%
mutate(Year = year(Date), Month = month(Date), Day = day(Date)) %>%
group_by(Year, Day) %>%
mutate(across(A:X, ~ case_when(is.na(.)
~ mean(., na.rm = TRUE), TRUE ~ as.numeric(.))))

Group data by year and filter by month in R

I have a list of data frames with daily streamflow data.
I want to estimate the maximum daily flow from June to November every year for each data frame in the list that corresponds each of them to data in a station.
This is how the list of data frames looks:
and this is the code I am using:
#Peak mean daily flow summer and fall (June to November)
PeakflowSummerFall <- lapply(listDF,function(x){x %>% group_by(x %>% mutate(year = year(Date)))
%>% filter((x %>% mutate(month = month(Date)) >= 6) & (x %>% mutate(month = month(Date)) <= 11))
%>% summarise(max=max(DailyStreamflow, na.rm =TRUE))})
but I am having this error:
<error/dplyr_error>
Problem with `filter()` input `..1`.
x Input `..1` must be of size 1, not size 24601.
i Input `..1` is `&...`.
i The error occurred in group 1: Date = 1953-06-01, DailyStreamflow = 32, year = 1953.
Backtrace:
Run `rlang::last_trace()` to see the full context
Any solution to this problem?
#### This should give provide you with enough
#### sample data for answerers to work with
install.packages('purrr')
library(purrr)
sample_dat <- listDF %>%
head %>%
map( ~ head(.x))
dput(sample_dat)
#### With that being said...
#### You should flatten the data frame...
#### It's easier to work with...
install.packages('lubridate')
library(lubridate)
listDF %>%
plyr::ldply(rbind) %>%
mutate(month = floor_date(Date, unit = 'month')) %>%
filter(month(Date) > 5, month(Date) < 12) %>%
group_by(.id, month) %>%
dplyr::summarise(max_flow = max(DailyStreamflow)) %>%
split(.$.id)
Given the posted image of the data structure, the following might work.
library(lubridate)
library(dplyr)
listDF %>%
purrr::map(function(x){
x %>%
filter(month(Date) >= 6 & month(Date) <= 11) %>%
group_by(year(Date)) %>%
summarise(Max = max(DailyStreamflow, na.rm = TRUE), .groups = "keep")
})
Test data creation code.
fun <- function(year, n){
d1 <- as.Date(paste(year, 1, 1, sep = "-"))
d2 <- as.Date(paste(year + 10, 12, 31, sep = "-"))
d <- seq(d1, d2, by = "day")
d <- sort(rep(sample(d, n, TRUE), length.out = n))
flow <- sample(10*n, n, TRUE)
data.frame(Date = d, DailyStreamflow = flow)
}
set.seed(2020)
listDF <- lapply(1:3, function(i) fun(c(1953, 1965, 1980)[i], c(24601, 13270, 17761)[i]))
str(listDF)
rm(fun)

Assigning floor date in groupby fails

this is my code and I have a problem with groupby :
library(dplyr)
library(lubridate)
df <- read.xlsx("Data.xlsx", sheet = "Sector-STOXX600", startRow = 2,colNames = TRUE, detectDates = TRUE, skipEmptyRows = FALSE)
df[2:19] <- data.matrix(df[2:19])
percent_change2 <- function(x)last(x)/first(x) - 1
monthly_return <- df %>%
group_by(gr = floor_date(Date, unit = "month")) %>%
summarize_at(vars(-Date, -gr), percent_change2) %>%
ungroup() %>%
select(-gr) %>%
as.matrix()
Indeed I have this error :
"Error in is_character(x) : object 'gr' not found"
Here is a sample of the dataset :
Date .SXQR .SXTR .SXNR .SXMR .SXAR .SX3R .SX6R .SXFR .SXOR .SXDR .SX4R .SXRR .SXER
1 2000-01-03 364.94 223.93 489.04 586.38 306.56 246.81 385.36 403.82 283.78 455.39 427.43 498.08 457.57
2 2000-01-04 345.04 218.90 474.05 566.15 301.13 239.24 374.64 390.41 275.93 434.92 414.10 476.17 435.72
UPDATE
volatility_function<- function(x)sqrt(252) * sd(diff(log(x))) * 100
annualized_volatility <- df %>%
mutate(Date=ymd(Date)) %>%
group_by(gr = floor_date(Date, unit = "year")) %>%
select(gr,everything()) %>%
summarize_at(vars(-Date, -gr), volatility_function) %>%
ungroup() %>% select(-gr) %>%
as.matrix()
head(annualized_volatility,5)
I tried what #NeslonGon told me to do, however I know get the same error on an another function, what should I do ?
The idea is that we don't need to summarise_at a grouped variable but use the Date to account for this. The select and mutate calls can be skipped. They're for convenience.
df %>%
mutate(Date=ymd(Date)) %>%
group_by(gr = floor_date(Date, unit = "month")) %>%
select(gr,everything()) %>%
summarize_at(vars(-Date), percent_change2) %>%
ungroup() %>%
select(-gr) %>%
as.matrix()

filtering intraday data R

I'm trying to filter intraday-data to include only certain period inside the day. Is there a trick in some packages to achieve this. Here is example data:
library(tibbletime)
example <- as.tibble(data.frame(
date = ymd_hms(seq(as.POSIXct("2017-01-01 09:00:00"), as.POSIXct("2017-01-02 20:00:00"), by="min")),
value = rep(1, 2101)))
I would like to include only 10:00:00 - 18:35:00 for each day, but can't achieve this nicely. My solution for now has been creating extra indic columns and then filter by them, but it hasn't worked well either.
You can use the function between() from data.table
example[data.table::between(format(example$date, "%H:%M:%S"),
lower = "10:00:00",
upper = "18:35:00"), ]
library(tibbletime)
library(tidyverse)
library(lubridate)
example <- as.tibble(data.frame(
date = ymd_hms(seq(as.POSIXct("2017-01-01 09:00:00"), as.POSIXct("2017-01-02 20:00:00"), by="min")),
value = rep(1, 2101)))
example %>%
mutate(time = as.numeric(paste0(hour(date),".",minute(date)))) %>%
filter(time >= 10 & time <= 18.35) %>%
select(-time)
This is pretty hacky but if you really want to stay in the tidyverse:
rng <- range((hms("10:00:00") %>% as_datetime()), (hms("18:35:00") %>% as_datetime()))
example %>%
separate(., date, into = c("date", "time"), sep = " ") %>%
mutate(
time = hms(time) %>% as_datetime(),
date = as_date(date)
) %>%
filter(time > rng[1] & time < rng[2]) %>%
separate(., time, into = c("useless", "time"), sep = " ") %>%
select(-useless)

Combine list of data frames with one column of characters

I am learning to get, cleaning and combining data. I am confused why in a loop rbind command result in returning 10 data instead of expected 30 data as when I combine it manually (i by i).
library(XML)
mergeal <- NULL
tabnums <- 3
for (i in 1:length(tabnums)) {
bnn <- paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
tabnums[i], "&&coming=22-Oct-2015&coming=22-Oct-2015")
tem <- readHTMLTable(bnn, header=T, stringsAsFactors=F)
#data cleaning
ff <- tem[8] #wanted data
ff1 <- as.data.frame(ff)
ff2 <- ff1[ , 1] #get 1st col data only
ff3 <- unique(ff2)
ff4 <- ff3[c(2,5:13)] #wanted list only
#merging dataset
mergeal <- rbind(mergeal, ff4)
}
I've tried using list rbind list of data frames with one column of characters and numerics but still have the same result as above. Appreciate any help on what I missed, thanks.
I cleaned up the data cause I was bored.
library(plyr)
library(XML)
library(dplyr)
library(magrittr)
library(stringi)
library(tidyr)
library(lubridate)
answer =
data_frame(tabnums = 1:3) %>%
group_by(tabnums) %>%
do(.$tabnums %>%
paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
., "&&coming=22-Oct-2015&coming=22-Oct-2015") %>%
readHTMLTable(header = T, stringsAsFactors = F) %>%
extract2(8)) %>%
ungroup %>%
select(V1) %>%
distinct %>%
mutate(V1 =
V1 %>%
stri_replace_all_fixed("Â", "\n") %>%
stri_replace_all_fixed("Type:", "\nType:") %>%
stri_replace_all_fixed("Time:", "\nTime:") %>%
stri_replace_all_fixed("Area:", "\nArea:") %>%
stri_split_fixed("\n")) %>%
unnest(V1) %>%
mutate(V1 = V1 %>% stri_trim) %>%
filter(V1 %>% stri_detect_regex("^There are currently") %>% `!`) %>%
filter(V1 != "") %>%
separate(V1, c("variable", "value"), sep = ":", fill = "left") %>%
mutate(variable = variable %>% mapvalues(NA, "Description"),
ID = variable %>% `==`("Description") %>% cumsum) %>%
spread(variable, value) %>%
mutate(Area = Area %>% extract_numeric,
Price = Price %>% extract_numeric,
Datetime =
Time %>%
stri_replace_all_fixed("a.m.", "am") %>%
stri_replace_all_fixed("p.m.", "pm") %>%
paste(Date, .) %>%
dmy_hm) %>%
select(-Date, -Time)

Resources