Rollmean over one month with different groups in R - r

I have following extract of my dataset:
library(dyplr)
library(runner)
example <- data.frame(Date <- c("2020-03-24", "2020-04-06" ,"2020-04-08" ,
"2020-04-13", "2020-04-14", "2020-04-15",
"2020-04-16", "2020-04-18", "2020-04-23",
"2020-04-24", "2020-04-26", "2020-04-29",
"2020-03-24", "2020-04-06" ,"2020-04-08" ,
"2020-04-01", "2020-04-12", "2020-04-15",
"2020-04-17", "2020-04-18", "2020-04-22",
"2020-05-01", "2020-05-15", "2020-05-29",
"2020-03-08", "2020-04-06" ,"2020-04-15",
"2020-04-22", "2020-04-28", "2020-05-05",
"2020-05-08", "2020-05-22", "2020-05-23"),
username <- c("steves_" ,"steves_" ,"steves_",
"steves_" ,"steves_" ,"steves_",
"steves_" ,"steves_" ,"steves_",
"steves_" ,"steves_" ,"steves_",
"jules_" ,"jules_" ,"jules_",
"jules_" ,"jules_" ,"jules_",
"jules_" ,"jules_" ,"jules_",
"jules_" ,"jules_" ,"jules_",
"mia" ,"mia" ,"mia",
"mia" ,"mia" ,"mia",
"mia" ,"mia" ,"mia"),
ER <- as.numeric(c("0.092", "0.08", "0.028",
"0.1", "0.09", "0.02",
"0.02", "0.8", "0.001",
"0.001", "0.1", "0.098",
"0.001", "0.002","0.02",
"0.0098", "0.002","0.0019",
"0.002", "0.11","0.002",
"0.02", "0.01", "0.009",
"0.19", "0.09", "0.21",
"0.22", "0.19", "0.22",
"0.09", "0.19", "0.28")))
colnames(example) <- c("Date", "username", "ER")
example$Date <- as.Date(example$Date)
str(example)
I would like to calculate the respective average of the ER over a month from the respective dates.
I know that there are similar contributions to this already in the forum - but unfortunately I could not find the solution for me.
I have tried the following solutions:
example$avgER_30days <- example %>%
arrange(username, Date) %>%
group_by(username) %>%
mutate(rollmean(example$ER, Date > (Date %m-% months(1)) & Date < Date, fill = NA))
or with the package runners
example$average <- example %>%
group_by(username) %>%
arrange(username, Date) %>%
mutate(mean_run(x = example$ER, k = 30, lag = 1, idx=example$Date)) %>%
ungroup(username)
I would be happy if you could help me!

Here are two equivalent alternatives.
In the first alternative below, the second argument to rollapplyr is a list such that the ith component is the vector of offsets to average over for the ith row of the group.
In the second alternative we can specify the width as a vector of widths, one per row, and then when taking the mean eliminate the last value.
Note that w is slightly different in the two alternatives.
Review ?rollapply for details on the arguments and for further examples.
library(dplyr, exclude = c("filter", "lag"))
library(zoo)
example %>%
arrange(username, Date) %>%
group_by(username) %>%
mutate(w = seq_along(Date) - findInterval(Date - 30, Date) - 1,
avg30 = rollapplyr(ER, lapply(-w, seq, to = -1), mean, fill=NA)) %>%
ungroup
example %>%
arrange(username, Date) %>%
group_by(username) %>%
mutate(w = seq_along(Date) - findInterval(Date - 30, Date),
avg30 = rollapplyr(ER, w, function(x) mean(head(x, -1)), fill = NA)) %>%
ungroup

Related

loading combined data in R

I have got several Excel files (see link: we.tl/t-qJl3kVcY0j) that I combine together.
Each Excel file have columns from A to AF as below:
t-ph
Load
HR
BF
V'E
V'O2
V'CO2
d O2/dW
RER
EqO2
EqCO2
PETCO2
VES (ml)
VESi (ml/m²)
FC (bpm)
QC (l/min)
IC (l/min/m²)
PAS (mmHg)
PAD (mmHg)
PAM (mmHg)
ICT
TEV (ms)
RPD (%)
WCI (kg.m/m²)
RVSi (dyn.s/cm5.m²)
RVS (dyn.s/cm5)
VTD est (ml)
FE est (%)
O2Hb
HHb
tHb
HbDiff
My previous code was working, but since I have added columns AC (HHb) to AF (HbDiff), I can't load them anymore. I have tried to match the number of columns, the title, but still not. I can't produce a reproducible example since I can't load my data.
Here is the code I used:
pacman::p_load(tidyverse, readxl, ggpubr)
library(dplyr)
library(ggplot2)
library(afex) ##statistic package
# load data and format
load_files <- function(files){
temp <- read_excel(files) %>%
select(-(c(8:11, 14:15, 18:23))) %>%
mutate(id = pull(.[4,1])) %>% ##ID
mutate(body_mass = pull(.[7,3])) %>% ##body mass
mutate(training = pull(.[4,2])) %>% ##training group
set_names(c("time", "power", "hr", "fr", "VE", "absVO2", "VCO2", "PETCO2", "VES", "QC", "IC", "WCI", "RVSi", "RVS", "VTD", "FE", "O2Hb", "HHb", "tHb", "HbDiff", "id", "body_mass", "training")) %>%
slice(86:which(grepl("ration", VE))-1) %>% ##until recovery period
mutate_at(vars(1:16), as.numeric) %>%
mutate_at(vars(18), as.numeric) %>%
mutate(time = format(as.POSIXct(Sys.Date() + time), "%H:%M", tz="UTC"),
absVO2 = absVO2/1000,
VCO2 = VCO2/1000)
}
# apply function to all files
df <- map_df(file_list, load_files)
# remove those with who have less than four similar power
df <- df %>%
mutate(len_seq = rep(rle(power)$lengths, rle(power)$lengths)) %>%
filter(len_seq == 4) %>%
mutate(seq_id = rep(1:(n()/4), each = 4)) %>%
group_by(id) %>%
select(-seq_id)%>%
select(-(20))
# group data
df_sum <- df %>%
type.convert(as.is = TRUE) %>%
group_by(id, power, training) %>%
summarise_if(is.numeric, mean) %>%
group_by(id) %>%
mutate(percent_absVO2 = absVO2/max(absVO2)*100,
percent_power = power/max(power)*100,
percent_QC = QC/max(QC)*100,
percent_SV = VES/max(VES)*100,
percent_VCO2 = VCO2/max(VCO2)*100,
percent_VE = VE/max(VE)*100) %>%
mutate(VE_VO2 = VE/absVO2,
VE_VCO2 = VE/VCO2) %>%
mutate(RER = VCO2/absVO2, VT = VE/fr) %>%
mutate(relVO2 = absVO2/body_mass*1000,
percent_relVO2 = relVO2/max(relVO2)*100) %>%
mutate(BF = VE/VT) %>%
mutate(mech_perf = (power/(((0.003*power+0.1208)*1000*body_mass)/60))*100) %>%
mutate(group = ifelse(grepl(".*-PRD-C", id), "CAD", "Healthy")) %>%
mutate(temps = ifelse(grepl(".*-PRD-C1", id), "1", ifelse(grepl(".*-PRD-S1", id), "1", "2")))
Then the outcome I get:
Error in `set_names()`:
! The size of `nm` (23) must be compatible with the size of `x` (20).
Run `rlang::last_error()` to see where the error occurred.
Thank you very much for your precious help.

Grouping By Multiple Selection Answer for Likert Package

I wanted to create a likert graph that is grouped by Question i. I can create the likert graph for total responses ungrouped, but im uncertain of how to reformat question 6 without losing the column for question i. (aka do the reformatting done below but also have it take into account who selected what in question i.)
What I want is the sufficiency of Q6 grouped by their answer in question i.
Sample Dataframe:
SurveyClean2 <- data.frame(i = c("Mail,Email", "Mail", "Mail,Email,Podcast", "Radio,Podcast", "Radio", "Mail,Radio"), Q6_3 = c("Not Sufficient", "Very Sufficient", "Completely Sufficient", "Moderately Sufficient", "Moderately Sufficient", "Not Sufficient"))
Unnesting Question i:
UnnestQi <- SurveyClean2 %>%
as_tibble() %>%
mutate(i = str_split(Q3, ",")) %>%
unnest(i)
Survey2Q6 <- UnnestQi |> drop_na(Q5) |> drop_na(i)
Reformating Question 6 to Likert-friendly format:
clean_survey <- function(data, column, question) {
data %>%
dplyr::select(all_of({{column}})) %>%
dplyr::mutate(Question = question) %>%
dplyr::group_by(Question, across(1)) %>%
dplyr::count() %>%
dplyr::ungroup() %>%
tidyr::pivot_wider(names_from = 2, values_from = n)
}
# table that contains survey questions/columns and the question name
survey_table <- dplyr::tibble(
column = c("Q6_3"),
question = c("Expert advice")
)
# loop through your data and clean it, then bind as dataframe
LikertGroupqi62 <- purrr::map2_df(survey_table$column, survey_table$question, function(x, y){
clean_survey(Survey2Q6, x, y)}) |>
mutate(across(everything(), ~ifelse(is.na(.), 0, .)))
## Likert
LikertGroupqi62 <- LikertGroupqi62 |> dplyr::select(Question, `Not Sufficient`, `Slightly Sufficient`, `Moderately Sufficient`, `Very Sufficient`, `Completely Sufficient`)
Likert WITHOUT grouping:
likert(Question~., LikertGroupqi62, ReferenceZero = 0, auto.key.in = list(columns = 1), main = list("Sufficiency of Cost-share Advice Based on Person or Agency Worked With"), col = c("#db6d00", "#924900", "#000000", "#004949", "#009292"),strip.left = FALSE, ylab = "", xlab = "Total Number of Respondents")

x axis as week number and secondary x-axis as date

I'm trying to plot a time serie with Primary x-axis as numeric and Secondary x-axis as date in ggplot. This is my poor try.
library(tidyverse)
tibble::tribble(
~date, ~ndvi,
"2020-05-18", 0.7655,
"2020-06-14", 0.723,
"2020-07-12", 0.6178,
"2020-08-21", 0.437,
"2020-09-07", 0.4763,
"2020-09-10", 0.4928,
"2020-09-12", 0.4831,
"2020-09-22", 0.4774,
"2020-10-02", 0.5794,
"2020-10-07", 0.606
) %>%
mutate(date = lubridate::ymd(date),
weeks = difftime(date, min(date), units="weeks")) %>%
ggplot(aes(weeks, ndvi)) +
geom_line()+
scale_x_datetime(
sec.axis = dup_axis(name = "", #breaks = date,
labels = scales::time_format("%b")))
#> Error: Invalid input: time_trans works with objects of class POSIXct only
And this is the desired output, where the Primary x-axis has weeks and Secondary x-axis has the months of the time serie
set labels for both x-axes separately. Please check also this discussion
library(tidyverse)
mydat <- tibble::tribble(
~date, ~ndvi,
"2020-05-18", 0.7655,
"2020-06-14", 0.723,
"2020-07-12", 0.6178,
"2020-08-21", 0.437,
"2020-09-07", 0.4763,
"2020-09-10", 0.4928,
"2020-09-12", 0.4831,
"2020-09-22", 0.4774,
"2020-10-02", 0.5794,
"2020-10-07", 0.606
) %>%
mutate(date = lubridate::ymd(date),
weeks = as.numeric(difftime(date, min(date), units="weeks")))
mydat %>%
ggplot(aes(date, ndvi)) +
geom_line() +
scale_x_date(date_breaks = "4 weeks", labels = scales::date_format("%W"),
sec.axis = dup_axis(name = "", labels = scales::date_format("%b")))
Created on 2021-02-10 by the reprex package (v1.0.0)

how to further refine expss table format?

I am trying to improve my table design using expss. My current design is shown below using the following code:
library(expss)
# bogus example data
x<-structure(list(visits= structure(c(17, 2, 23, 1, 21), label = "Total # Home Visits", class = c("labelled", "numeric")), months_enrolled = structure(c(21.42474, 51.105, 52.474, 53.75, 60.0392105), label = "Enrollment Duration (months)", class =c("labelled","numeric")), marital2 = structure(c("Married", NA, "Married", "Married", "Married"), label = "Marital Status", class = c("labelled", "character")), Relationship2 = structure(c("Mother", "Mother", "Mother", "Mother", "Mother"), label = "Relationship (recoded)", class = c("labelled", "character"))), row.names = c(NA, 5L), class = "data.frame")
htmlTable(x %>%
tab_cells(visits,months_enrolled) %>%
tab_rows(marital2, Relationship2, total()) %>% tab_stat_fun(Mean = w_mean, "Valid N" = w_n, method = list) %>%
tab_pivot() %>%
set_caption("Table 6: Bogus Visits and Duration by Characteristics") %>%
htmlTable(.,css.cell = c("width: 220px", # first column width
rep("width: 50px", ncol(.) - 1))))
I'd like to improve the table design by placing the mean statistics for Home Visits and Enrollment Duration as columns, thus saving a row for each level of Marital Status (and other vars in tab_rows). How is this achieved? Also, is it possible to shade alternate rows?
It seems, the simplest way is to transpose table:
htmlTable(x %>%
tab_cells(visits, months_enrolled) %>%
tab_cols(marital2, Relationship2, total()) %>%
tab_rows(total(label = "|")) %>%
tab_stat_fun(Mean = w_mean, "Valid N" = w_n) %>%
tab_pivot() %>%
tab_transpose() %>%
set_caption("Table 6: Bogus Visits and Duration by Characteristics") %>%
htmlTable(.,css.cell = c("width: 220px", # first column width
rep("width: 50px", ncol(.) - 1))))

R replace_na values conditionally by column with multiple conditions

My question is similar to other replace_na posts but I can't find the right combination of answers.
I have a dataframe with inflation rates for all countries over 8 years (wide format - countries as rows and years as columns).
Some countries have NAs for all 8 years (columns 3:10), and in that case I want to replace all NAs with the column mean
library(tidyverse)
sample %>%
mutate_if((rowSums(is.na[,3:10]))!=8, replace_na = colMeans(na.rm=T))
This is close but something is wrong.
Other countries only have NAs in some columns, in which case I want to replace NA with the previous year's value
library(zoo)
sample %>%
mutate_if((rowSums(is.na[,3:10]))!=8, replace_na = colMeans(na.rm=T)),
is.na[,4:10], na.locf(fromLast = TRUE)))
Tried using na.locf from the zoo package but can't get it right with the other conditions
The final condition is that, if the NA is in the first year (2007), I want to replace it with the 2007 column mean instead of the next year (2008 was the financial crisis so all the inflation rates went nuts).
mutate_if((rowSums(is.na[,3:10]))!=8, replace_na = colMeans(na.rm=T)),
is.na[,4:10], na.locf(fromLast = TRUE)),
is.na("2007"), replace = colMeans("2007", na.rm = TRUE))
But this is full of errors and I'm stuck trying to link all these conditions together - pretty new to ifelse statements. I'm trying to find a dplyr solution as that's the syntax I'm most familiar with, but maybe it's easier in base R or data.table
running R 3.6.1
sample <- structure(list(`Country Name` = c("Aruba", "Afghanistan", "Angola",
"Albania", "Andorra", "Arab World", "United Arab Emirates", "Argentina",
"Armenia", "American Samoa", "Antigua and Barbuda", "Australia"
), `Country Code` = c("ABW", "AFG", "AGO", "ALB", "AND", "ARB",
"ARE", "ARG", "ARM", "ASM", "ATG", "AUS"), `2007` = c(5.39162036843645,
8.68057078513406, 12.2514974459487, 2.93268248162318, NA, 4.74356585295154,
NA, NA, 4.40736089644519, NA, 1.41605259409743, 2.32761128891476
), `2008` = c(8.95722105296535, 26.4186641547444, 12.4758291326398,
3.36313757366391, NA, 11.2706652380848, 12.2504202448139, NA,
8.94995335353386, NA, 5.33380639820232, 4.35029854990047), `2009` = c(-2.13630037272305,
-6.81116108898995, 13.7302839288409, 2.23139683475865, NA, 2.92089711805365,
1.55980098148558, NA, 3.40676682683799, NA, -0.550159995508869,
1.77111716621252), `2010` = c(2.07773902027782, 2.1785375238942,
14.4696564932574, 3.61538461538463, NA, 3.91106195534027, 0.879216764156813,
NA, 8.17636138473956, NA, 3.3700254022015, 2.91834002677376),
`2011` = c(4.31633194082721, 11.8041858089129, 13.4824679218511,
3.44283593170005, NA, 4.75316388885632, 0.877346595685083,
NA, 7.6500080785929, NA, 3.45674967234599, 3.30385015608744
), `2012` = c(0.627927921638161, 6.44121280934118, 10.2779049218839,
2.03642235579081, NA, 4.61184432206646, 0.662268900269082,
NA, 2.55802007757907, NA, 3.37688044338879, 1.76278015613193
), `2013` = c(-2.37226328015073, 7.38577178397857, 8.77781429332619,
1.92544399507649, NA, 3.23423783752364, 1.10111836375706,
NA, 5.78966778544654, NA, 1.05949782356168, 2.44988864142539
), `2014` = c(0.421637771012246, 4.67399603536339, 7.28038730361125,
1.61304235314414, NA, 2.77261158414198, 2.34626865671643,
NA, 2.98130868933673, NA, 1.08944157435363, 2.48792270531403
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-12L))
First compute a logical vector, all.na having one component per row which is TRUE if that row's numeric data is all NAs and FALSE otherwise. Then use na.aggregate to fill in all-NA rows. Also use na.aggregate on 2007. Then convert to long form and apply na.locf0 by country and convert back to wide form.
library(dplyr)
library(tidyr)
library(zoo)
all.na <- sample %>%
select_if(is.numeric) %>%
{ rowSums(is.na(.)) == ncol(.) }
sample %>%
mutate_at(-(1:3), ~ if_else(all.na, na.aggregate(.x), .x)) %>%
mutate(`2007` = na.aggregate(`2007`)) %>%
gather(key, value, -`Country Name`, -`Country Code`) %>%
group_by(`Country Name`, `Country Code`) %>%
mutate(value = na.locf0(value)) %>%
ungroup %>%
spread(key, value)
or using only zoo:
library(zoo)
all.na <- apply(is.na(sample[grep("^2", names(sample))]), 1, all)
ix <- -(1:3)
sample.out <- sample
Fill <- function(x) ifelse(all.na, na.aggregate(x), x)
sample.out[ix] <- lapply(sample[ix], Fill)
sample.out$"2007" <- na.aggregate(sample.out$"2007")
sample.out[ix] <- t(apply(sample.out[ix], 1, na.locf0))
Tried to do the same by only using the tidyverse (dplyr and tidyr), this is what I got:
Created the Replacement according to condition 1. Column Means.
# Getting the Column Means to Replace according to Condition 1 and 3.
replacement <- sample %>%
select_if(is.numeric) %>%
summarize_all( ~ mean(., na.rm = TRUE)) %>%
#Transformed to List since it is a requirement for tidyr::replace_na()
as.list()
Then I created everything into just one pipeline.
sample %>%
pivot_longer(`2007`:`2014`, names_to = "year", values_to = "int_rate") %>%
group_by(`Country Name`) %>%
summarize(na_num = is.na(int_rate) %>% sum) %>%
#Joining the number of NAs na_num as a new column
left_join(sample, by = "Country Name") %>%
#Replacing 2007 missing as a first value. Condition 3.
mutate(`2007` = if_else(between(na_num, 1, 7) &
is.na(`2007`), replacement[[1]] , `2007`)) %>%
#Making dataset wider
pivot_longer(`2007`:`2014`, names_to = "year", values_to = "int_rate") %>%
group_by(`Country Name`) %>%
#Using fill to impute NAs with the previous one. Condition 2.
fill(int_rate) %>%
pivot_wider(names_from = year, values_from = int_rate) %>%
#Replacing Values when all values are missing. Condition 1.
replace_na(replace = replacement)

Resources