Getting the last 6 months data in R - r

i had data frame which contain many row. Here it is:
library(lubridate)
date_ <- date(seq(ymd_h("2020-01-01-00"), ymd_h("2021-03-31-23"), by = "hours"))
hour_ <- hour(seq(ymd_h("2020-01-01-00"), ymd_h("2021-03-31-23"), by = "hours"))
game1 <- sort(round(runif(length(date_), 10, 50), 0), decreasing = TRUE)
game2 <- sort(round(runif(length(date_), 20, 100), 0), decreasing = TRUE)
game3 <- sort(round(runif(length(date_), 30, 150), 0), decreasing = TRUE)
game4 <- sort(round(runif(length(date_), 40, 200), 0), decreasing = TRUE)
game_data <- data.frame(date_, hour_, game1, game2, game3, game4)
I just want to subset game_data to get all the last 6 months data. How do i get it?

Last 6 months of data from the max date in the data ?
You can try -
library(lubridate)
library(dplyr)
result <- subset(game_data, date_ > max(date_) %m-% months(6))
Or with dplyr -
result <- game_data %>% filter(date_ > max(date_) %m-% months(6))

Related

Assigning factor labels and levels within a function

I have the following data frame:
library(janitor)
library(lubridate)
library(tidyverse)
data <- data.frame(date = c("1/28/2022", "1/25/2022", "1/27/2022", "1/23/2022"),
y = c(100, 25, 35, 45))
I need to write a function that adds a new column that sorts the date column and assigns sequential day stage (i.e., Day 1, Day 2, etc.). So far I have tried the following with no luck.
day.assign <- function(df){
df2 <- clean_names(df)
len <- length(unique(df2$date))
levels.start <- as.character(sort(mdy(unique(df2$date))))
day.label <- paste("Day", seq(1, len, by = 1))
df <-
df %>%
mutate(Date = as.character(mdy(Date)),
Day = as.factor(Date,
levels = levels.start,
labels = day.label))
}
Future files will have a various amount of dates that must be accounted for when assigning the day column (i.e., one file may have 4 dates while the next may have 6).
You could do:
library(lubridate)
library(dplyr)
data <- data.frame(date = c("1/28/2022", "1/25/2022", "1/27/2022", "1/23/2022"),
y = c(100, 25, 35, 45))
day.assign <- function(df) {
df %>%
mutate(Date = mdy(date)) %>%
arrange(mdy(date)) %>%
mutate(Day = paste0("Day ", row_number()))
}
day.assign(data)
#> date y Date Day
#> 1 1/23/2022 45 2022-01-23 Day 1
#> 2 1/25/2022 25 2022-01-25 Day 2
#> 3 1/27/2022 35 2022-01-27 Day 3
#> 4 1/28/2022 100 2022-01-28 Day 4

New column with random boolean values while controlling the ratio of TRUE/FALSE per category

In R I've got a dataset like this one:
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
Now I want to add a new column with randomized boolean values, but inside each category the proportion of TRUE and FALSE values should be the same (i.e. the randomizing process should generate the same count of true and false values, in the above data frame 5 TRUEs and 5 FALSEs in each of the 3 categories). How to do this?
You can sample a vector of "TRUE" and "FALSE" values without replacement so you have a randomized and balanced column in your data-frame.
sample(rep(c("TRUE","FALSE"),each=5),10,replace=FALSE)
Based on Yacine Hajji answer:
addRandomBool <- function(df, p){
n <- ceiling(nrow(df) * p)
df$bool <- sample(rep(c("TRUE","FALSE"), times = c(n, nrow(df) - n)))
df
}
Reduce(rbind, lapply(split(df, df$category), addRandomBool, p = 0.5))
where parametar p determines the proportion of TRUE.
This will sample within each group from a vector of 5 TRUE and 5 FALSE without replacement. It will assume that there are always 10 records per group.
library(dplyr)
library(tidyr)
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){ # Function to saple and assign the new_col
df$new_col <- sample(rep(c(FALSE, TRUE),
each = 5),
size = 10,
replace = FALSE)
df
})) %>%
unnest(cols = "data")
This next example is a little more generalized, but still assumes (approximately) even distribution of TRUE and FALSE within a group. But it can accomodate variable group sizes, and even groups with odd numbers of records (but will favor FALSE for odd numbers of records)
library(dplyr)
library(tidyr)
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})) %>%
unnest(cols = "data")
Maintaining Column Order
A couple of options to maintain the column order:
First, you can save the column order before you do your group_by - nest, and then use select to set the order when you're done.
set.seed(pi)
orig_col <- names(df) # original column order
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})) %>%
unnest(cols = "data") %>%
select_at(c(orig_col, "new_col")) # Restore the column order
Or you can use a base R solution that doesn't change the column order in the first place
df <- split(df, df["category"])
df <- lapply(df,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})
do.call("rbind", c(df, list(make.row.names = FALSE)))
There are likely a dozen other ways to do this, and probably more efficient ways that I'm not thinking of.

Split amount pro rata monthly

Hi ladies and gentlemen,
here you see my input file:
Input <- data.table(
id = c("x1","x2","x3"),
from = c("2020-01-01","2020-02-15","2019-01-15"),
to = c("2020-12-31","2021-02-14","2021-02-14"),
Amount = c(120, 120, 240)
)
basically I would like to split the amount on a monthly basis. Can you suggest a fast and elegant way to do it?
Here what I would like to get for one of the id (for the other id the logic should be the same).
Output_x2 <- data.table(
id = c("x2"),
Period = c(2021),
to = c(202002, 202003, 202004, 202005, 202006, 202007, 202008, 202009, 202010, 202011, 202012, 202101, 202102),
Amount_ProRata = c(5, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,5)
)
A possible solution:
library(data.table)
library(tidyverse)
library(lubridate)
spread <- function(x,y,z,value)
{
nMonths <- as.duration(y %--% z) %>%
as.numeric("months") %>%
round(0)
frac1 <- y %>%
{day(.)/days_in_month(.)} %>%
round(1)
frac2 <- z %>%
{day(.)/days_in_month(.)} %>%
round(1)
wholem <- round(nMonths - frac1 - frac2,0)
splita <- (value/nMonths) * c(frac1,rep(1,wholem),frac2)
splita <- splita[splita > 0]
months <- seq(ymd(y),ymd(z)+2, by = "months")
months <- months[1:length(splita)]
df <- data.frame(id=x,
Period=year(months),
to=paste0(year(months),sprintf("%02d",month(months))),
Amount_ProRata=splita)
}
map_dfr(1:nrow(Input),
~ spread(Input$id[.x], Input$from[.x], Input$to[.x], Input$Amount[.x]))

10 Day intervals with a overlapping date

I have a annual data set that I would like to break into 10 day intervals. For example I would like to subset 2010-12-26 to 2011-01-04 create a home range using the x and y values for those dates, then get the next 9 days plus an overlapping date between the subsetted data in this case it would be 2011-01-04 (2011-01-04 to 2011-01-13). Is there a good way to do this?
library(lubridate)
date <- rep_len(seq(dmy("26-12-2010"), dmy("20-01-2011"), by = "days"), 500)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000))
I separated them by 10 observations, but I am unsure as to how I can do make it more specific to get 10 days instead of just 10 observations.
interval_10 <- lapply(
seq(0, nrow(df), by = 10),
function(k) df[max(k+1, 1):min(k + 10, nrow(df)), ]
)
Thank you
lapply through the unique date vector will do the work:
t <- unique(date)[seq(from = 1, to = length(unique(date)), by = 9)]
interval_10 <- lapply(
1:(length(t)-1),
function(k) df %>% filter(date <= t[k+1], date >= t[k])
)

Precipitation values for every 5 minutes to hourly summaries in R

I am trying to get the total precipitation values for every hour from a personal weather station I have using the weatherData package. The problem I have is that the data is collected every five minutes and the values repeat themselves until there is a change in precipitation value. I have tried the 'duplicated' function but I get a large number of data removed when there is no precipitation which makes it hard for me to get a summary of the hourly precipitation.
Please see code below
## Load required libraries
library(weatherData)
library(ggplot2)
library(scales)
library(plyr)
library(reshape2)
library(gridExtra)
library(lubridate)
library(weathermetrics)
library(zoo)
# Get data for PWS using weatherData package
pws <- getWeatherForDate("IPENANGB2", "2014-09-01","2014-09-30", station_type = "id",opt_detailed=T, opt_custom_columns=T, custom_columns=c(1,2,6,7,10))
# Rename columns
colnames(pws)<-c("time","time1","tempc","wdd","wspd","prcp")
## Adding date columns
pws$time<-as.POSIXct(pws$time1,format="%Y-%m-%d %H:%M:%S",tz="Australia/Perth")
pws$year <- as.numeric(format(pws$time,"%Y"))
pws$date <-as.Date(pws$time,format="%Y-%m-%d",tz="Australia/Perth")
pws$year <- as.numeric(as.POSIXlt(pws$date)$year+1900)
pws$month <- as.numeric(as.POSIXlt(pws$date)$mon+1)
pws$monthf <- factor(pws$month,levels=as.character(1:12),labels=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"),ordered=TRUE)
pws$weekday <- as.POSIXlt(pws$date)$wday
pws$weekdayf <- factor(pws$weekday,levels=rev(0:6),labels=rev(c("Mon","Tue","Wed","Thu","Fri","Sat","Sun")),ordered=TRUE)
pws$yearmonth <- as.yearmon(pws$date)
pws$yearmonthf <- factor(pws$yearmonth)
pws$week <- as.numeric(format(as.Date(pws$date),"%W"))
pws$weekf<- factor(pws$week)
pws$jday<-yday(pws$date)
pws$hour <- as.numeric(format(strptime(pws$time, format = "%Y-%m-%d %H:%M"),format = "%H"))
pws$min <- as.numeric(format(strptime(pws$time, format = "%Y-%m-%d %H:%M"),format = "%M"))
# Remove duplicate values
pws.df <- pws[!duplicated(pws$prcp),]
Assuming you want to get hourly averages of tempc, wdd, wspd, prcp:
# used packages
library(weatherData)
library(lubridate)
library(dplyr)
library(stringr)
# read data
pws <- getWeatherForDate("IPENANGB2",
"2014-09-01",
"2014-09-30",
station_type = "id",
opt_detailed = T,
opt_custom_columns = T,
custom_columns = c(1, 2, 6, 7, 10))
# rename columns
colnames(pws) <- c("time", "time1", "tempc", "wdd", "wspd", "prcp")
# cleaning dataset and adding some columns
useful_pws <-
pws %>%
select(2:6) %>%
filter(!str_detect(time1, "<br>")) %>%
mutate(time1 = ymd_hms(time1),
year = year(time1),
month = month(time1),
day = day(time1),
hour = hour(time1)) %>%
tbl_df()
# summarising dataset
useful_pws %>%
select(-time1) %>%
group_by(year, month, day, hour) %>%
summarise(tempc = mean(tempc, na.rm = TRUE),
wdd = mean(wdd, na.rm = TRUE),
wspd = mean(wspd, na.rm = TRUE),
prcp = mean(prcp, na.rm = TRUE))

Resources