I am computing statistics (i.e., mean, max, median etc) for winter season comprised of Months 11 & 12 of previous year and Months 1-4 of following year.
mydate <- as.data.frame(seq(as.Date("2010-01-01"), to= as.Date("2019-12-31"), by="day"))
colnames(mydate) <- "Date"
DF <- data.frame(A = runif(3652,0,10),
J = runif(3652,0,8),
X = runif(3652,0,12),
Z = runif(3652,0,10),
mydate)
mydata <- DF %>% mutate(Year = year(Date), Month = month(Date)) %>%
pivot_longer(-c(Date,Year,Month), names_to = "variable", values_to = "values") %>%
filter(Month == 11 | Month == 12 | Month == 01 | Month == 02 | Month == 03 | Month == 04) %>%
mutate(W_Year = ifelse(Month > 10, Year+1, Year)) %>%
filter(W_Year != 2019) %>%
group_by(W_Year, variable) %>%
mutate(Cumulative = cumsum(values)) %>%
mutate(NewDate = ymd(paste("2020", Month, day(Date), sep = "-"))) %>%
ungroup() %>%
group_by(variable, NewDate) %>%
summarise(Median = median(Cumulative))
I would then need to combine mydata with the data.frame of accumulated values for the most recent year, which in my case is the year 2019 as an additional column.
X = c("A", "J","X", "Z")
Data2019 <- DF %>% mutate(Year = year(Date), Month = month(Date)) %>%
pivot_longer(-c(Date,Year,Month), names_to = "variable", values_to = "values") %>%
filter(between(Month,5,10)) %>%
filter(Year == 2019) %>%
group_by(Year, variable) %>%
mutate(Precipitation = cumsum(values)) %>%
mutate(NewDate = ymd(paste("2020", Month,day(Date), sep = "-"))) %>%
ungroup() %>%
group_by(variable, NewDate) %>%
select(c(4,6,7)) %>%
slice(match(X, variable))
While combining the two data.frame, i am getting mis-match error for number of rows- which i believe is due to leap year but do not know how to overcome this problem. Any way forward would help. Thank you,
Data_plot <- data.frame(mydata, Data2019[,2])
Related
I have a list of data frames int1 and int2. The end goal of this code is to assign the names to the elements in int1 and int2. The rest of the workflow for my work requires me to name the elements of the list multiple times, and I was wondering how I could create a function to reduce the number of keystrokes down the line using base r functions. Any ideas?
library(lubridate)
library(tidyverse)
library(purrr)
date <- rep_len(seq(dmy("01-01-2011"), dmy("31-07-2011"), by = "days"), 200)
ID <- rep(c("A","B", "C"), 200)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
df$Month <- month(df$date)
# Create first list
int1 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "1") %>%
group_split()
# Create second list
int2 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "2") %>%
group_split()
# Expected Output
# Assign names to int1
names(int1) <- sapply(int1, function(x) paste(x$ID[1],
x$new[1], sep = "_"))
# Assign names to int2
names(int2) <- sapply(int2, function(x) paste(x$ID[1],
x$new[1], sep = "_"))
Using group_split will not name the list elements. It is specified in the ?group_split
it does not name the elements of the list based on the grouping as this typically loses information and is confusing.
Instead use split from base R, which will return with the names pasteed using . from the 'ID', 'new' columns
int1 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "1") %>% ungroup %>%
{split(., .[c('ID', 'new')])}
Similarly for int2
The executable code below generates a scatter plot that depends on the date (date2) he chooses and three lines are also generated, referring to mean, mean+standard deviation and mean-standard deviation, which are based on the day of the week (Week) that is chosen.
As you can see, I used vector i to generate the mean and standard deviation. But I would like to optimize this, that is, when he chooses the date, he already understands what day of the week it is, so he doesn't need to use this i vector.
For example, I put it to generate scatterplot date 10/04/2021, so the code would need to know it's a Saturday, without having to set vector i to 3.
Can you help me with this question?
The link to download the database is:https://docs.google.com/spreadsheets/d/1W_hzuRq7D6X12BdwaXeM-cjg2A5MIKDx/edit?usp=sharing&ouid=102073768617937039119&rtpof=true&sd=true
library(dplyr)
library(ggplot2)
library(tidyr)
library(lubridate)
df<-read_excel('C:/Users/Downloads/database_test1.xlsx')
df<-subset(df,df$date2<df$date1)
dim_data<-dim(df)
day<-c(seq.Date(from = as.Date(df$date2[1]),
to = as.Date(df$date2[dim_data[1]]),
by = "1 day"))
df_grouped <- df %>%
mutate(across(starts_with("date"), as.Date)) %>%
group_by(date2) %>%
summarise(Id = first(Id),
date1 = first(date1),
Week = first(Week),
D = first(D),
D1 = sum(D1)) %>%
select(Id,date1,date2,Week,D,D1)
df_grouped <- df_grouped %>% mutate(date1=format(date1,"%d/%m/%Y"),
date2=format(date2,"%d/%m/%Y"))
df_grouped<-data.frame(df_grouped)
DS=c("Thursday","Friday","Saturday")
i<-3
df_OC<-subset(df_grouped,is.na(D))
ds_OC<-subset(df_OC,df_OC$Week==DS[i])
#Mean and Standard Deviation
mean_Week<-mean(as.numeric(ds_OC[,"D1"]) )
sdeviation_Week<-sd(as.numeric(ds_OC[,"D1"]))
#create scatter plot
scatter_date <- function(dt, dta = df) {
dta %>%
filter(date2 == ymd(dt)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name)) %>%
plot(xlab = "Days", ylab = "Types", xlim = c(0, 7),
ylim = c((min(.$val) %/% 10) * 10, (max(.$val) %/% 10 + 1) * 15))
abline(h=mean_Week, col='blue')
abline(h=(mean_Week + sdeviation_Week), col='green',lty=2)
abline(h=(mean_Week - sdeviation_Week), col='orange',lty=2)
}
scatter_date("2021-04-10",df)
Generated images
You could create a lookup table:
library(tibble)
lookup <- df %>%
select(date2, Week) %>%
distinct() %>%
mutate(date2 = ymd(date2)) %>%
deframe()
lookup
#> 2021-03-04 2021-04-02 2021-04-03 2021-04-08 2021-04-09 2021-04-10
#> "Thursday" "Friday" "Saturday" "Thursday" "Friday" "Saturday"
So now
lookup["2021-04-10"]
#> "Saturday"
To use this with your scatterplot function you need to move some of your code into your function.
One more idea of optimization:
# You could put this lines into one pipe
df_grouped <- df %>%
mutate(across(starts_with("date"), as.Date)) %>%
group_by(date2) %>%
summarise(Id = first(Id),
date1 = first(date1),
Week = first(Week),
D = first(D),
D1 = sum(D1)) %>%
select(Id, date1, date2, Week, D, D1) %>%
mutate(date1 = format(date1, "%d/%m/%Y"),
date2 = format(date2, "%d/%m/%Y"))
# you don't need this line
# df_grouped<-data.frame(df_grouped)
Two more hints:
Use a space after ",". This makes the code easier to read.
Avoid using different types of quoting marks: use either " or ' not both (unless you have to use both).
According to https://stackoverflow.com/a/68948847/8282674 you can adapt your scatter_date with a switch statment and calculate every mean in there. The other way with less changes in your code, would be to remove DS=c("Thursday","Friday","Saturday") to calculate the weekday in the scatter_date function directly:
library(dplyr)
library(ggplot2)
library(tidyr)
library(lubridate)
df<-readxl::read_excel('C:/Users/Downloads/database_test1.xlsx')
df<-subset(df,df$date2<df$date1)
# translate the days
df %>% dplyr::mutate(Week = ifelse(Week=="Thursday", "quinta-feira", Week),
Week = ifelse(Week=="Friday", "sexta-feira", Week),
Week = ifelse(Week=="Saturday", "sábado", Week)) -> df
dim_data<-dim(df)
day<-c(seq.Date(from = as.Date(df$date2[1]),
to = as.Date(df$date2[dim_data[1]]),
by = "1 day"))
df_grouped <- df %>%
mutate(across(starts_with("date"), as.Date)) %>%
group_by(date2) %>%
summarise(Id = first(Id),
date1 = first(date1),
Week = first(Week),
D = first(D),
D1 = sum(D1)) %>%
select(Id,date1,date2,Week,D,D1)
df_grouped <- df_grouped %>% mutate(date1=format(date1,"%d/%m/%Y"),
date2=format(date2,"%d/%m/%Y"))
df_grouped<-data.frame(df_grouped)
#create scatter plot
scatter_date <- function(dt, dta = df) {
# get the week day
my_day <- weekdays(as.Date(dt))
df_OC<-subset(df_grouped,is.na(D))
ds_OC<-subset(df_OC,df_OC$Week==my_day) # omit 'i' and DS
mean_Week<-mean(as.numeric(ds_OC[,"D1"]) )
sdeviation_Week<-sd(as.numeric(ds_OC[,"D1"]))
mean_Week_pos <- (mean_Week + sdeviation_Week)
mean_Week_neg <- (mean_Week - sdeviation_Week)
dta %>%
filter(date2 == ymd(dt)) %>%
summarize(across(starts_with("DR"), sum)) %>%
pivot_longer(everything(), names_pattern = "DR(.+)", values_to = "val") %>%
mutate(name = as.numeric(name)) %>%
plot(xlab = "Days", ylab = "Types", xlim = c(0, 7),
main = paste0(my_day, ": (", mean_Week, ",+",mean_Week_pos, ",-", mean_Week_neg,")"),
ylim = c((min(.$val) %/% 10) * 10, (max(.$val) %/% 10 + 1) * 15))
abline(h=mean_Week, col='blue')
abline(h= mean_Week_pos, col='green',lty=2)
abline(h= mean_Week_neg, col='orange',lty=2)
}
scatter_date("2021-04-10",df)
scatter_date("2021-04-9",df)
scatter_date("2021-04-8",df)
I am trying to filter out a data set into two months. I would like to filter out the ID and year that have data, and to remove the ID and year that do not have an associated pair.
For example if an ID and year has both the January and July month in the data set, I would like to include this ID and the year in my filtered data. If an ID has only the month of January and not July, I would like to remove this data and not include it in the filtered data set. Is there a good way to do this? Just a note that I wasn't sure how to simulate the uneven data set in the example.
After filtering for my desired output, I test by creating a list for each seasonal month where each ID and year has at least 15 rows associated with it.
library(lubridate)
library(dplyr)
set.seed(12345)
df <- tibble(
date = sample(seq(dmy("01-01-2010"), dmy("31-12-2013"), by = "days"),
1000, replace = TRUE),
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID = rep(1:5, 200),
month = month(date),
year =year(date)) %>%
arrange(ID, date)
df %>%
filter(month %in% c(1,7)) %>%
group_by(ID, year) %>%
mutate(complete = length(unique(month)) == 2) %>%
group_by(ID) %>%
filter(all(complete)) %>%
group_by(ID, year)
# Creates a list for each year and by ID
summer_list <- df %>%
filter(month %in% 7) %>%
filter(n() >= 15) %>%
group_split(year, ID)
# Renames the names in the list to AnimalID and year
names(summer_list) <- sapply(summer_list,
function(x) paste(x$ID[1],
x$year[1], sep = '_'))
# Creates a list for each year and by ID
winter_list <- df1 %>%
filter(month %in% 1) %>%
filter(n() >= 15) %>%
group_split(year, ID)
# Renames the names in the list to ID and year
names(winter_list) <- sapply(winter_list,
function(x) paste(x$ID[1],
x$year[1], sep = '_'))
You were really close. I think your filter can be simplified to the following. Just be sure to save it to df.
df <- df %>%
filter(month %in% c(1,7)) %>%
group_by(ID, year) %>%
mutate(complete = length(unique(month)) == 2) %>%
filter(complete)
# could add "%>% select(-c(complete))" to get rid of complete
On summer_list and winter_list, add a group_by between the filters. With the dataset you provided, there were no groups with 15 records, but I tested that this works by bumping up the size of df until I got some.
summer_list <- df %>%
filter(month == 7) %>% # used == since there's only one test value
group_by(ID, year) %>% # added this
filter(n() >= 15) %>%
group_split()
There's also a typo in your first use of winter_list -- the input data is df1, but I think you want df. Hope this works!
Here's the complete code including the larger df:
library(lubridate)
library(dplyr)
set.seed(12345)
df <- tibble(
date = sample(seq(dmy("01-01-2010"), dmy("31-12-2013"), by = "days"),
4000, replace = TRUE),
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID = rep(1:5, 800),
month = month(date),
year =year(date)) %>%
arrange(ID, date)
df <- df %>%
filter(month %in% c(1,7)) %>%
group_by(ID, year) %>%
mutate(complete = length(unique(month)) == 2) %>%
filter(complete)
# could add "%>% select(-c(complete))" to get rid of complete
# Creates a list for each year and by ID
summer_list <- df %>%
filter(month == 7) %>%
group_by(ID, year) %>%
filter(n() >= 15) %>%
group_split()
# Renames the names in the list to AnimalID and year
names(summer_list) <- sapply(summer_list,
function(x) paste(x$ID[1],
x$year[1], sep = '_'))
# Creates a list for each year and by ID
winter_list <- df %>%
filter(month == 1) %>%
group_by(ID, year) %>%
filter(n() >= 15) %>%
group_split()
# Renames the names in the list to ID and year
names(winter_list) <- sapply(winter_list,
function(x) paste(x$ID[1],
x$year[1], sep = '_'))
I have multiple columns that has missing values. I want to use the mean of the same day across all years while filling the missing data for each column. for example, DF is my fake data where I see missing values for the two columns (A & X)
library(lubridate)
library(tidyverse)
library(naniar)
set.seed(123)
DF <- data.frame(Date = seq(as.Date("1985-01-01"), to = as.Date("1987-12-31"), by = "day"),
A = sample(1:10,1095, replace = T), X = sample(5:15,1095, replace = T)) %>%
replace_with_na(replace = list(A = 2, X = 5))
To fill in Column A, i use the following code
Fill_DF_A <- DF %>%
mutate(Year = year(Date), Month = month(Date), Day = day(Date)) %>%
group_by(Year, Day) %>%
mutate(A = ifelse(is.na(A), mean(A, na.rm=TRUE), A))
I have many columns in my data.frame and I would like to generalize this for all the columns to fill in the missing value?
We can use na.aggregate from zoo
library(dplyr)
library(zoo)
DF %>%
mutate(Year = year(Date), Month = month(Date), Day = day(Date)) %>%
group_by(Year, Day) %>%
mutate(across(A:X, na.aggregate))
Or if we prefer to use conditional statements
DF %>%
mutate(Year = year(Date), Month = month(Date), Day = day(Date)) %>%
group_by(Year, Day) %>%
mutate(across(A:X, ~ case_when(is.na(.)
~ mean(., na.rm = TRUE), TRUE ~ as.numeric(.))))
A day with precipitation >= 2.5 mm is called a rainy day. I could able to calculate monthwise rainy days using the following code
library(seas)
library(tidyverse)
library(zoo)
library(lubridate)
data(mscdata)
dat.int <- (mksub(mscdata, id=1108447))
dat.int %>%
as_tibble() %>% # for easier viewing
mutate(yearmon = as.yearmon(dat.int$date, "%b %y")) %>%
dplyr::select(-date, -year, -yday, -t_max, -t_min, -t_mean) %>%
pivot_longer(cols = -yearmon, names_to = "variable", values_to = "value") %>%
group_by(yearmon, variable) %>%
summarise(rainy_days = sum(value > 2.5)) %>%
pivot_wider(names_from = "variable", values_from = "rainy_days")
Then I have calculated the longterm average using the following code
dat.int %>%
as_tibble() %>% # for easier viewing
mutate(yearmon = as.yearmon(dat.int$date, "%b %y")) %>%
dplyr::select(-date, -year, -yday, -t_max, -t_min, -t_mean) %>%
pivot_longer(cols = -yearmon, names_to = "variable", values_to = "value") %>%
group_by(yearmon, variable) %>%
summarise(rainy_days = sum(value > 2.5)) %>%
mutate(year = year(yearmon)) %>%
group_by(variable) %>%
summarize(value = as.integer(round(mean(rainy_days, na.rm = T)))) %>%
pivot_wider(names_from = "variable", values_from = "value")
Now two thresholds should be calculated as: lower threshold = 0.81*long term average and upper threshold = 1.19*long term average. Then calculate the number of years having rainy days between these two thresholds. Now I want to calculate the number of years having rainy days in the range of 81–119% of long term average (between lower and upper threshold).
Edit: Based on OP's comments and wanting to summarize by total precip, rain and snow.
library(dplyr)
library(lubridate)
dat.int %>%
mutate(month = month(ymd(date))) %>%
group_by(year, month) %>%
summarize_at(vars(precip,rain,snow), funs(days = sum(. >= 2.5,na.rm = TRUE))) %>%
group_by(year) %>%
summarize_at(vars(ends_with("days")), funs(yearly = sum(.))) %>%
summarize_at(vars(-year), list(~ sum(. > mean(.) * 0.81 & . < mean(.) * 1.19))) %>%
rename_all(list(~ gsub("days_yearly","in_range",.))) summarize(years = n())
# precip_in_range rain_in_range snow_in_range
# <int> <int> <int>
#1 26 24 6