I would like to:
randomly assign an index date to people in the df_1 according to the distribution of the index date in the df_2;
the newly assigned index date should be earlier than the death date of people (the death date contains NA)
Currently I am using:
df_1 <-df_1 %>% mutate(index_date=sample(df_2$index_date, size=n(), replace=TRUE))
However, I do not know how to limit the index date before the death_date in df_1.
I'm guessing your data-format, but rowwise should do the trick:
df_1 <- df_1 %>%
rowwise() %>%
mutate(index_date = sample(df_2$index_date[df_2$index_date <= death_date], size=1)) %>%
ungroup()
Repex with mtcars:
mtcars %>%
rowwise() %>%
mutate(mpg_rand = sample(mtcars$mpg[mtcars$mpg <= mpg], 1))
Related
I have a longitudinal dataset in long format and I am want to create two variables, one where visit=1 value for a given ID is subtracted from all other values and one where visit=0 value for a given ID is subtracted from all other values. Sample dataset:
df<-data.frame(value=rnorm(30),
visit=rep(seq(0:9), 3), i
d=c(rep("A", 10), rep("B", 10), rep("C", 10)))
I know one method is to use the following code:
df %>% group_by(id) %>%
arrange(visit, .by_group=TRUE) %>%
mutate(value_chg_v1=value-nth(value, n=2),
value_chg_v0=value-nth(value, n=1))
but I want to construct the code so that if a entire row is missing for visit=0, value_chg_v0=NA. Is there a way to do this that does not involve adding in the rows where visit=0 is missing? Thanks in advance.
We could use complete to create the missing combinations
library(dplyr)
library(tidyr)
df %>%
complete(id, visit = 0:9) %>%
group_by(id) %>%
arrange(visit, .by_group=TRUE) %>%
mutate(value_chg_v1=value-nth(value, n=2),
value_chg_v0=value-nth(value, n=1)) %>%
ungroup %>%
drop_na(value)
A data wrangling question:
I have a dataframe of hourly animal tracking points with columns for id, time, and whether the animal is on land or in water (0 = water; 1 = land). It looks something like this:
set.seed(13)
n <- 100
dat <- data.frame(id = rep(1:5, each = 10),
datetime=seq(as.POSIXct("2020-12-26 00:00:00"), as.POSIXct("2020-12-30 3:00:00"), by = "hour"),
land = sample(0:1, n, replace = TRUE))
What I need to do is flag the first row after which the animal uses land at least once for 3 straight days. I tried doing something like this:
dat$ymd <- ymd(dat$datetime[1]) # make column for year-month-day
# add land points within each id group
land.pts <- dat %>%
group_by(id, ymd) %>%
arrange(id, datetime) %>%
drop_na(land) %>%
mutate(all.land = cumsum(land))
#flag days that have any land points
flag <- land.pts %>%
group_by(id, ymd) %>%
arrange(id, datetime) %>%
slice(n()) %>%
mutate(flag = if_else(all.land == 0,0,1))
# Combine flagged dataframe with full dataframe
comb <- left_join(land.pts, flag)
comb[is.na(comb)] <- 1
and then I tried this:
x = comb %>%
group_by(id) %>%
arrange(id, datetime) %>%
mutate(time.land=ifelse(land==0 | is.na(lag(land)) | lag(land)==0 | flag==0,
0,
difftime(datetime, lag(datetime), units="days")))
But I still can't quite wrap my head around what to do to make it so that I can figure out when the animal has been on land at least once for three days straight, and then flag that first point on land. Thanks so much for any help you can provide!
Create a date column from the timestamp. Summarise the data and keep only 1 row for each id and date which shows whether the animal was on land even once in the entire day.
Use zoo's rollapply function to mark the first day as TRUE if the next 3 days the animal was on land.
library(dplyr)
library(zoo)
dat <- dat %>% mutate(date = as.Date(datetime))
dat %>%
group_by(id, date) %>%
summarise(on_land = any(land == 1)) %>%
mutate(consec_three = rollapply(on_land, 3,all, align = 'left', fill = NA)) %>%
ungroup %>%
#If you want all the rows of the data
left_join(dat, by = c('id', 'date'))
I have 2 codes that manipulate and filter (by date) my data.frame and that work perfectly. Now I want to run the code for not only one day, but for every day in vector:
seq(from=as.Date('2020-03-02'), to=Sys.Date(),by='days')` #.... 538 days
The code I want to run for all the days between 2020-03-02 and today is:
KOKOKO <- data.frame %>%
filter(DATE < '2020-03-02')%>%
summarize(DATE = '2020-03-02', CZK = sum(Objem.v.CZK,na.rm = T)
STAVPTF <- data.frame %>%
filter (DATE < '2020-03-02')%>%
group_by(CP) %>%
summarize(mnozstvi = last(AKTUALNI_MNOZSTVI_AKCIE), DATE = '2020-03-02') %>%
select(DATE,CP,mnozstvi) %>%
rbind(KOKOKO)%>%
drop_na() %>%
So instead of '2020-03-02' I want to fill in all days since '2020-03-02' one after another. And each of the KOKOKO and STAVPTF created for the unique day like this I want to save as a separate data.frame and all of them store in a list.
We could use map to loop over the sequence and apply the code
library(dplyr)
library(purrr)
out <- map(s1, ~ data.frame %>%
filter(DATE < .x)%>%
summarize(DATE = .x, CZK = sum(Objem.v.CZK,na.rm = TRUE))
As this is repeated cycle, a function would make it cleaner
f1 <- function(dat, date_col, group_col, Objem_col, aktualni_col, date_val) {
filtered <- dat %>%
filter({{date_col}} < date_val)
KOKOKO <- filtered %>%
summarize({{date_col}} := date_val,
CZK = sum({{Objem_col}}, na.rm = TRUE)
STAVPTF <- filtered %>%
group_by({{group_col}}) %>%
summarize(mnozstvi = last({{aktualni_col}}),
{{date_col}} := date_val) %>%
select({{date_col}}, {{group_col}}, mnozstvi) %>%
bind_rows(KOKOKO)%>%
drop_na()
return(STAVPTF)
}
and call as
map(s1, ~ f1(data.frame, DATE, CP, Objem.v.CZK, AKTUALNI_MNOZSTVI_AKCIE, !!.x))
where
s1 <- seq(from=as.Date('2020-03-02'), to=Sys.Date(), by='days')
It would be easier to answer your question, if you would provide a minimal reproducible example. It's easy done with tidyverses reprex packages
However, your KOKOKO code can be rewritten as simple cumulative sum:
KOKOKO =
data.frame %>%
arrange(DATE) %>% # if necessary
group_by(DATE) %>%
summarise(CZK = sum(Objem.v.CZK), .groups = 'drop') %>% # summarise per DATE (if necessary)
mutate(CZK = cumsum(CZK) - CZK) # cumulative sum excluding current row (current DATE)
Even STAVPTF code can probably be rewritten without iterations. First find the last value of AKTUALNI_MNOZSTVI_AKCIE per CP and DATE. Then this value is assigned to the next DATE:
STAVPTF <-
data.frame %>%
group_by(CP, DATE) %>%
summarise(mnozstvi = last(AKTUALNI_MNOZSTVI_AKCIE), .groups='drop_last') %>%
arrange(DATE) %>% # if necessary
mutate(DATE = lead(DATE))
I am looking to figure out how to filter after grouping my data within summarise. I have 2 created columns below. I'd ideally like to filter the seasonTotal column within summarise to a value of greater than 3, and then calculate the homeRunsPerSeason based on that filtered count.
Reprex below:
library(Lahman)
library(tidyverse)
data <- Lahman::Batting
data <- data %>%
filter(yearID > 2015)
grouped_data <- data %>%
group_by(playerID) %>%
summarise(seasonTotal = n(),
homeRunsPerSeason = sum(HR / seasonTotal)
)
Separate each of the steps you want to accomplish. Calculate the season total, filter, then summarize.
grouped_data <- data %>%
group_by(playerID) %>%
mutate(seasonTotal = n()) %>%
filter(seasonTotal > 3) %>%
summarise(homeRunsPerSeason = sum(HR / seasonTotal))
I've got a data frame (df) with two variables, site and purchase.
I'd like to use dplyr() to group my data by site and purchase, and get the counts and percentages for the grouped data. I'd however also like the tibble to feature rows called ALLSITES, representing the data of all the sites grouped by purchase, so that I end up with a tibble looking similar to dfgoal.
The problem's that my current code doesn't get me the ALLSITES rows. I've tried adding a base R function into dplyr(), which doesn't work.
Any help would be much appreciated.
Starting point (df):
df <- data.frame(site=c("LON","MAD","PAR","MAD","PAR","MAD","PAR","MAD","PAR","LON","MAD","LON","MAD","MAD","MAD"),purchase=c("a1","a2","a1","a1","a1","a1","a1","a1","a1","a2","a1","a2","a1","a2","a1"))
Desired outcome:
dfgoal <- data.frame(site=c("LON","LON","MAD","MAD","PAR","ALLSITES","ALLSITES"),purchase=c("a1","a2","a1","a2","a1","a1","a2"),bin=c(1,2,6,2,4,11,4),pin_per=c(33.33333,66.66667,75.00000,25.00000,100.00000,73.33333,26.66666))
Current code:
library(dplyr)
df %>%
group_by(site, purchase) %>%
summarize(bin = sum(purchase==purchase)) %>%
group_by(site) %>%
mutate(bin_per = (bin/sum(bin)*100))
df %>%
rbind(df, transform(df, site = "ALLSITES") %>%
group_by(site, purchase) %>%
summarize(bin = sum(purchase==purchase)) %>%
group_by(site) %>%
mutate(bin_per = (bin/sum(bin)*100))
We can start from the first output code block, after grouping by 'site' with a created string of 'ALLSITES' and 'purchase' get the sum of 'bin' and later 'bin_per', then with bind_rows row bind the two datasets
df1 %>%
ungroup() %>%
group_by(site = 'ALLSITES', purchase) %>%
summarise(bin = sum(bin)) %>%
ungroup %>%
mutate(bin_per = 100*(bin/sum(bin))) %>%
bind_rows(df1, .)