r arrange data nested wide format - r

I have a dataset like this
Time1 Time2 Time3
A
Median 0.046 0.12 0
Q1, Q3 -0.12, 0.22 -1.67, -4.59 -0.245, 0.289
Range -2.75 -4.65 -2.20 - 1.425 -3.12, -1.928
B
Median 0.016 0.42 0.067
Q1, Q3 -0.21, 0.63 -1.17, -2.98 -0.478, 0.187
Range -2.15 -2.15 -1.12 - 1.125 -1.45, -1.478
What I want is to make this look like this
Time1 Time2 Time3
Median Q1,Q3 Range Median Q1,Q3 Range Median Q1,Q3 Range
A 0.046 -0.12, 0.22 2.75 -4.65 0.12 -1.67, -4.59 -2.20 - 1.425 0 -0.245, 0.289 -3.12, -1.928
B 0.016 -0.21, 0.63 -2.15 -2.15 0.42 -1.17, -2.98 -1.12 - 1.125 0.067 -0.478, 0.187 -1.45, -1.478
I have used spread function before to change long to wide, not sure how to turn this into a nested wide. Any suggestions is much appreciated.
df <- structure(list(Col1 = c("A", "Median", "Q1, Q3", "Range", "B",
"Median", "Q1, Q3", "Range"), Time1 = c("", "0.046", "-0.12, 0.22",
"-2.75 -4.65", "", "0.016", "-0.21, 0.63", "-2.15 -2.15"), Time2 = c("",
"0.12", "-1.67, -4.59", "-2.20 - 1.425", "", "0.42", "-1.17, -2.98",
"-1.12 - 1.125"), Time3 = c("", "0 ", "-0.245, 0.289 ",
"-3.12, -1.928", "", "0.067 ", "-0.478, 0.187 ", "-1.45, -1.478"
)), class = "data.frame", row.names = c(NA, -8L))

Here is a potential solution, see comments for the step by step.
library(tidyr)
#find rows containing the ids
namerows <- which(df$Time1=="")
#create and fill in the id column
df$id <- ifelse(df$Time1=="", df$Col1, NA)
df <- fill(df, id, .direction="down")
#clean up the dataframe
df <- df[-namerows, ]
#pivot
pivot_wider(df, id_cols = "id", names_from = "Col1", values_from = starts_with("Time"))
The result:
# A tibble: 2 × 10
id Time1_Median `Time1_Q1, Q3` Time1_Range Time2_Median `Time2_Q1, Q3` Time2_Range Time3_Median `Time3_Q1, Q3` Time3_Range
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 A 0.046 -0.12, 0.22 -2.75 -4.65 0.12 -1.67, -4.59 -2.20 - 1.425 "0 " "-0.245, 0.289 " -3.12, -1.928
2 B 0.016 -0.21, 0.63 -2.15 -2.15 0.42 -1.17, -2.98 -1.12 - 1.125 "0.067 " "-0.478, 0.187 " -1.45, -1.478

Related

Translating loop syntax from Stata to R

I need to write a for loop to calculate the product of year variables (e.g. var1874) * price variables (e.g. num1874), creating a new variable for each year and its corresponding price value (e.g. newvar1874).
Here's my data in R
A tibble: 4 x 7
cty var1874 var1875 var1876 num1874 num1875 num1876
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0.78 0.83 0.99 2.64 2.8 3.1
2 2 0.69 0.69 0.89 2.3 2.3 2.58
3 3 0.42 0.48 0.59 2.28 2.44 2.64
4 4 0.82 0.94 1.09 2.28 2.36 3
I've been able to do this using the 'foreach' loop in Stata:
local vn 1874 1875 1876
foreach v of local vn {
gen newvar'v' = var'v'*num'v'
Does anyone know how I would do this same type of command using the for loop in R? I know there may be simpler ways to do this without the for loop, but I need to know how to do this using the for loop.
Using a for loop you could do:
vn <- 1874:1876
for (v in vn) d[[paste0("newvar", v)]] <- d[[paste0("var", v)]] * d[[paste0("num", v)]]
d
#> cty var1874 var1875 var1876 num1874 num1875 num1876 newvar1874 newvar1875
#> 1 1 0.78 0.83 0.99 2.64 2.80 3.10 2.0592 2.3240
#> 2 2 0.69 0.69 0.89 2.30 2.30 2.58 1.5870 1.5870
#> 3 3 0.42 0.48 0.59 2.28 2.44 2.64 0.9576 1.1712
#> 4 4 0.82 0.94 1.09 2.28 2.36 3.00 1.8696 2.2184
#> newvar1876
#> 1 3.0690
#> 2 2.2962
#> 3 1.5576
#> 4 3.2700
Or using lapply you could do:
d[, paste0("newvar", vn)] <- lapply(vn, function(v) d[[paste0("var", v)]] * d[[paste0("num", v)]])
DATA
d <- structure(list(
cty = 1:4, var1874 = c(0.78, 0.69, 0.42, 0.82),
var1875 = c(0.83, 0.69, 0.48, 0.94), var1876 = c(
0.99, 0.89,
0.59, 1.09
), num1874 = c(2.64, 2.3, 2.28, 2.28), num1875 = c(
2.8,
2.3, 2.44, 2.36
), num1876 = c(3.1, 2.58, 2.64, 3)
), class = "data.frame", row.names = c(
"1",
"2", "3", "4"
))

Clumpsy code for calculating mean and standard error

I have tried to make a piece of code that calculate the mean and standard error of my data and puts it into a new tibble.
It feels very clumpsy, however. Does anyone know of packages or other tricks that could make my code more elegant?
I need to calculate mean and se for a number of sub groups (days_incubated).
library(dplyr)
library(tibble)
library(tidyr)
library(data.table)
library(plotrix)
df2 <- df1%>%
group_by(days_incubated)%>%
summarise_each(funs(mean, se= std.error))%>% # Calculating mean and standard error
mutate_if(is.numeric, round, digits = 2) # Round off the data
df2_trans <- transpose(df2) # Transposing data table
colnames(df2_trans) <- rownames(df2) # Get row and colnames in order
rownames(df2_trans) <- colnames(df2) # Get row and colnames in order
df2_trans <- rownames_to_column(df2_trans, "mass") # Making row names into a column
df3_trans <- df2_trans%>% # Converting one column into two
separate(mass, c("mass","type"), sep = "([_])")
mean_target <- c("mean", "incubated")
mean <- df3_trans%>% # Mean table
filter(type %in% mean_target)%>%
rename("mean day 0"="1")%>%
rename("mean day 4"="2")%>%
rename("mean day 10"="3")%>%
rename("mean day 17"="4")%>%
rename("mean day 24"="5")%>%
rename("mean day 66"="6")%>%
rename("mean day 81"="7")%>%
rename("mean day 94"="8")%>%
rename("mean day 116"="9")%>%
select("mass", "mean day 0", "mean day 4", "mean day 10", "mean day 17", "mean day 24", "mean day 66", "mean day 81", "mean day 94", "mean day 116")%>%
slice(-c(1))
se_target <- c("se", "incubated")
se <- df3_trans%>% # SE table
filter(type %in% se_target)%>%
rename("se day 0"="1")%>%
rename("se day 4"="2")%>%
rename("se day 10"="3")%>%
rename("se day 17"="4")%>%
rename("se day 24"="5")%>%
rename("se day 66"="6")%>%
rename("se day 81"="7")%>%
rename("se day 94"="8")%>%
rename("se day 116"="9")%>%
select("mass", "se day 0", "se day 4", "se day 10", "se day 17", "se day 24", "se day 66", "se day 81", "se day 94", "se day 116")%>%
slice(-c(1))
# join mean and se tables
mean_se <- mean %>% #merging mean and se dataset
full_join(se, by=("mass"))%>%
select("mass","mean day 0","se day 0", "mean day 4", "se day 4", "mean day 10", "se day 10", "mean day 17", "se day 17", "mean day 24", "se day 24", "mean day 66", "se day 66", "mean day 81", "se day 81", "mean day 94", "se day 94", "mean day 116", "se day 116") # Putting columns in correct order
And here's the data:
df1 <- structure(list(days_incubated = c("0", "0", "0", "0", "0", "4",
"4", "4", "4", "4", "10", "10", "10", "10", "10", "17", "17",
"17", "17", "17", "24", "24", "24", "24", "24", "66", "66", "66",
"66", "66", "81", "81", "81", "81", "81", "94", "94", "94", "94",
"94", "116", "116", "116", "116", "116"), i.x33.031 = c(7.45,
0, 78.2, 16.49, 18.77, 104.5, 28.95, 26.05, 4.11, 62.09, 1.95,
6.75, 1.41, 3.34, 3.02, 0, 100.28, 0.2, 32.66, 0, 0, 370.57,
7.24, 133.63, 55.26, 0.16, 5.5, 25.17, 16.59, 3.3, 23.95, 30.61,
4.04, 0, 6.58, 0.08, 0.01, 0, 0.38, 0, 0, 0, 0, 0.18, 0), i.x35.034 = c(0,
0, 0.15, 0.02, 0.01, 0.04, 0.04, 0.05, 0.02, 0.09, 0.02, 0, 0.04,
0.01, 0, 0, 0.22, 0, 0.08, 0, 0, 0.66, 0.01, 0.2, 0.12, 0.01,
0.01, 0.04, 0.01, 0.01, 0.01, 0.04, 0, 0, 0, 0, 0, 0, 0.01, 0,
0, 0.02, 0, 0, 0.02), i.x36.017 = c(0.47, 0.09, 0.28, 0.02, 0.03,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.05,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0.06, 0.32, 0, 0, 0, 0, 0.12,
0, 0.02), i.x39.959 = c(0.02, 0, 0.08, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.04, 0, 0, 0, 0, 0, 0.01, 0, 0,
0, 0, 0, 0.01, 0.02, 0.06, 0.03, 0.03, 0, 0, 0.02, 0.01, 0, 0,
0), i.x40.023 = c(0.35, 0.02, 0.48, 0.06, 0, 1.25, 0.09, 0.1,
0.03, 0, 0.09, 0.07, 0.55, 0.09, 0.07, 0, 0.63, 0, 0.09, 0.07,
0.02, 1.11, 0.04, 0.59, 0.13, 0, 0.01, 0.02, 0, 0, 0, 0, 0.01,
0.02, 0.06, 0.01, 0.01, 0.01, 0.01, 0.04, 0, 0.08, 0, 0, 0.01
)), row.names = c(NA, -45L), class = "data.frame")
Does this give you what you are looking for? It seems to reproduce the output. The note there is that I think your result actually has the columns labeled wrong. One of your steps did a character sort on the columns, so they are not in the order you expected when your renamed later.
What we do is first use pivot_longer() so that we have tidy data. From there, we can group and summarize to calculate the mean and standard errors. Then we pivot_wider() to move it back to the wide format of your result.
library(dplyr)
library(tidyr)
library(plotrix)
col_order <- paste0(rep(paste0(c("mean", "se"), "_day_"), length(unique(df1$days_incubated))),
rep(unique(df1$days_incubated), each = 2))
df1 %>%
pivot_longer(-days_incubated, names_to = "mass") %>%
group_by(days_incubated, mass) %>%
summarize(mean = mean(value),
se = std.error(value), .groups = "drop") %>%
pivot_wider(names_from = days_incubated, values_from = c("mean", "se"),
names_glue = "{.value}_day_{days_incubated}") %>%
relocate(mass, all_of(col_order))
# A tibble: 5 x 19
mass mean_day_0 se_day_0 mean_day_4 se_day_4 mean_day_10 se_day_10 mean_day_17 se_day_17 mean_day_24 se_day_24 mean_day_66 se_day_66 mean_day_81
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 i.x3~ 24.2 13.9 45.1 17.5 3.29 0.932 26.6 19.5 113. 68.6 10.1 4.67 13.0
2 i.x3~ 0.036 0.0287 0.048 0.0116 0.014 0.00748 0.06 0.0429 0.198 0.121 0.016 0.006 0.01
3 i.x3~ 0.178 0.0867 0 0 0 0 0 0 0.01 0.01 0 0 0
4 i.x3~ 0.02 0.0155 0 0 0 0 0 0 0.008 0.008 0.002 0.002 0.006
5 i.x4~ 0.182 0.0978 0.294 0.240 0.174 0.0941 0.158 0.119 0.378 0.210 0.006 0.004 0.018
# ... with 5 more variables: se_day_81 <dbl>, mean_day_94 <dbl>, se_day_94 <dbl>, mean_day_116 <dbl>, se_day_116 <dbl>
Line-by-line
First, I have added in a short step to help column sorting.
col_order is a character vector we create of the correct final order we are looking for. Basically we are just concatenating strings to match the column names. You can skip this and simply type each name individually much like you did in your select() statement, but this saves typing.
Now onto the pipeline.
pivot_longer() will take the data from the columns and stack them. The column names will now be in a variable called mass. By default, the values will be in a new variable called value.
group_by() groups on the specified variables, which are the row and column combinations you want in the final table.
summarize() collapses the multiple rows for each mass and days_incubated combination into one new row, with two columns: the mean and se. The .groups = "drop" argument says to drop the groups (i.e., the tibble is no longer grouped, check the help file for more details).
pivot_wider() takes the long formatted summary tibble and pivots the data out into columns, much how you had your format. It says to take the column names from days_incubated, and to take the values from the mean and the se columns. The names_glue argument specifies how to name the new columns. It uses syntax from the glue packages, there the { brackets basically mean "substitute a value here". So it is the '[name of value column]_day_[days_incubated]' as each name.
relocate() simply reorders the columns. This is relatively new to dplyr, and is the preferred way now to rearrange columns instead of select(). It says to first take mass, and then all_of() the columns in the col_order vector that we created earlier.
I'd like to offer a solution mainly in base R.
library(dplyr)
# Define a function that computes the standard error of a vector.
f.SE <- function(x) sd(x)/sqrt(length(x))
# Define a function that compute the standard error of a column (MARGIN = 2 for column).
f.colSE <- function(mydf) apply(mydf, MARGIN = 2, FUN = f.SE)
# Define a function that combines the means and the standard errors of a column
f.colMeanSE <- function(mycol) cbind(means = colMeans(mycol), se = f.colSE(mycol))
# Apply the function to all numeric columns in your data frame by using the days_incubated subgroup as the indices. The result is a list.
my_mean_se1 <- by(data = df1[,-1], INDICES = df1$days_incubated, FUN= f.colMeanSE); my_mean_se1
#df1$days_incubated: 0
#means se
#i.x33.031 24.182 13.91245823
#i.x35.034 0.036 0.02874022
#i.x36.017 0.178 0.08668333
#i.x39.959 0.020 0.01549193
#i.x40.023 0.182 0.09779571
#------------------------------------------------------------
# df1$days_incubated: 10
#means se
#i.x33.031 3.294 0.932119091
#i.x35.034 0.014 0.007483315
#i.x36.017 0.000 0.000000000
#i.x39.959 0.000 0.000000000
#i.x40.023 0.174 0.094106323
#------------------------------------------------------------
# Bind the columns.
my_mean_se2 <- do.call(cbind, my_mean_se1); my_mean_se2
# means se means se means se means se means se means se means se means
#i.x33.031 24.182 13.91245823 3.294 0.932119091 0.036 0.036000000 26.628 19.46476571 113.340 68.5767486 45.140 17.49416646 10.144 4.665935 13.036
#i.x35.034 0.036 0.02874022 0.014 0.007483315 0.008 0.004898979 0.060 0.04289522 0.198 0.1212601 0.048 0.01157584 0.016 0.006000 0.010
#i.x36.017 0.178 0.08668333 0.000 0.000000000 0.028 0.023323808 0.000 0.00000000 0.010 0.0100000 0.000 0.00000000 0.000 0.000000 0.000
#i.x39.959 0.020 0.01549193 0.000 0.000000000 0.006 0.004000000 0.000 0.00000000 0.008 0.0080000 0.000 0.00000000 0.002 0.002000 0.006
#i.x40.023 0.182 0.09779571 0.174 0.094106323 0.018 0.015620499 0.158 0.11939012 0.378 0.2103188 0.294 0.23972067 0.006 0.004000 0.018
# se means se
#i.x33.031 6.002108463 0.094 0.07304793
#i.x35.034 0.007745967 0.002 0.00200000
#i.x36.017 0.000000000 0.136 0.07194442
#i.x39.959 0.004000000 0.024 0.01122497
#i.x40.023 0.011135529 0.016 0.00600000
# Bind the names of the columns to the names of days_incubated
names_days <- unique(df1$days_incubated)
names_meanse <- rbind(paste0("mean day ", names_days), paste0("se day ", names_days))
colnames(my_mean_se2) <- names_meanse
# Round the numbers to 2 decimal places and convert the numeric table to a data frame.
mean_se <- as.data.frame(round(my_mean_se2, digits = 2)); mean_se
# mean day 0 se day 0 mean day 4 se day 4 mean day 10 se day 10 mean day 17
#i.x33.031 24.18 13.91 3.29 0.93 0.04 0.04 26.63
#i.x35.034 0.04 0.03 0.01 0.01 0.01 0.00 0.06
#i.x36.017 0.18 0.09 0.00 0.00 0.03 0.02 0.00
#i.x39.959 0.02 0.02 0.00 0.00 0.01 0.00 0.00
#i.x40.023 0.18 0.10 0.17 0.09 0.02 0.02 0.16
# se day 17 mean day 24 se day 24 mean day 66 se day 66 mean day 81 se day 81
#i.x33.031 19.46 113.34 68.58 45.14 17.49 10.14 4.67
#i.x35.034 0.04 0.20 0.12 0.05 0.01 0.02 0.01
#i.x36.017 0.00 0.01 0.01 0.00 0.00 0.00 0.00
#i.x39.959 0.00 0.01 0.01 0.00 0.00 0.00 0.00
#i.x40.023 0.12 0.38 0.21 0.29 0.24 0.01 0.00
# mean day 94 se day 94 mean day 116 se day 116
#i.x33.031 13.04 6.00 0.09 0.07
#i.x35.034 0.01 0.01 0.00 0.00
#i.x36.017 0.00 0.00 0.14 0.07
#i.x39.959 0.01 0.00 0.02 0.01
#i.x40.023 0.02 0.01 0.02 0.01
# Convert the data frame to a tibble with rowname set as a new column
as_tibble(mean_se, rownames = "rowname")
# A tibble: 5 x 19
# rowname `mean day 0` `se day 0` `mean day 4` `se day 4` `mean day 10` `se day 10` `mean day 17` `se day 17` `mean day 24` `se day 24`
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 i.x33.~ 24.2 13.9 3.29 0.93 0.04 0.04 26.6 19.5 113. 68.6
#2 i.x35.~ 0.04 0.03 0.01 0.01 0.01 0 0.06 0.04 0.2 0.12
#3 i.x36.~ 0.18 0.09 0 0 0.03 0.02 0 0 0.01 0.01
#4 i.x39.~ 0.02 0.02 0 0 0.01 0 0 0 0.01 0.01
#5 i.x40.~ 0.18 0.1 0.17 0.09 0.02 0.02 0.16 0.12 0.38 0.21
# ... with 8 more variables: `mean day 66` <dbl>, `se day 66` <dbl>, `mean day 81` <dbl>, `se day 81` <dbl>, `mean day 94` <dbl>, `se day
# 94` <dbl>, `mean day 116` <dbl>, `se day 116` <dbl>
I think what you started off with was in right direction. You can next get data in long format, change the column names the way you want it and get the data back in wide format.
library(dplyr)
library(tidyr)
df1 %>%
group_by(days_incubated = as.numeric(days_incubated)) %>%
summarise(across(i.x33.031:i.x40.023, list(mean = mean, se = plotrix::std.error))) %>%
pivot_longer(cols = -days_incubated,
names_to = c('mass', 'col'),
names_sep = '_') %>%
#If you need column exactly as shown
mutate(col = paste(col, 'day')) %>%
pivot_wider(names_from = c(col, days_incubated), values_from = value, names_sep = ' ')
# mass `mean day 0` `se day 0` `mean day 4` `se day 4` `mean day 10` `se day 10`
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 i.x3… 24.2 13.9 45.1 17.5 3.29 0.932
#2 i.x3… 0.036 0.0287 0.048 0.0116 0.014 0.00748
#3 i.x3… 0.178 0.0867 0 0 0 0
#4 i.x3… 0.02 0.0155 0 0 0 0
#5 i.x4… 0.182 0.0978 0.294 0.240 0.174 0.0941
# … with 12 more variables: `mean day 17` <dbl>, `se day 17` <dbl>, `mean day
# 24` <dbl>, `se day 24` <dbl>, `mean day 66` <dbl>, `se day 66` <dbl>, `mean day
# 81` <dbl>, `se day 81` <dbl>, `mean day 94` <dbl>, `se day 94` <dbl>, `mean day
# 116` <dbl>, `se day 116` <dbl>

Computing mean of different columns depending on date

My data set is about forest fires and NDVI values (a value ranging from 0 to 1, indicating how green is the surface). It has an initial column which says when the forest fire of row one took place, and subsequent columns indicating the NDVI value on different dates, before and after the fire happened. NDVI values before the fire are substantially higher compared with values after the fire. Something like:
data1989 <- data.frame("date_fire" = c("1987-01-01", "1987-07-03", "1988-01-01"),
"1986-01-01" = c(0.5, 0.589, 0.66),
"1986-06-03" = c(0.56, 0.447, 0.75),
"1986-10-19" = c(0.8, NA, 0.83),
"1987-01-19" = c(0.75, 0.65,0.75),
"1987-06-19" = c(0.1, 0.55,0.811),
"1987-10-19" = c(0.15, 0.12, 0.780),
"1988-01-19" = c(0.2, 0.22,0.32),
"1988-06-19" = c(0.18, 0.21,0.23),
"1988-10-19" = c(0.21, 0.24, 0.250),
stringsAsFactors = FALSE)
> data1989
date_fire X1986.01.01 X1986.06.03 X1986.10.19 X1987.01.19 X1987.06.19 X1987.10.19 X1988.01.19 X1988.06.19 X1988.10.19
1 1987-01-01 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21
2 1987-07-03 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24
3 1988-01-01 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25
I would like to compute the average of NDVI values, in a new column, PRIOR to the forest fire. In case one, it would be the average of columns 2, 3, 4 and 5.
What I need to get is:
date_fire X1986.01.01 X1986.06.03 X1986.10.19 X1987.01.19 X1987.06.19 X1987.10.19 X1988.01.19 X1988.06.19 X1988.10.19 meanPreFire
1 1987-01-01 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21 0.653
2 1987-07-03 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24 0.559
3 1988-01-01 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25 0.764
Thanks!
EDIT: SOLUTION
How to adapt the code with more than one column to exclude:
data1989 <- data.frame("date_fire" = c("1987-02-01", "1987-07-03", "1988-01-01"),
"type" = c("oak", "pine", "oak"),
"meanRainfall" = c(600, 300, 450),
"1986.01.01" = c(0.5, 0.589, 0.66),
"1986.06.03" = c(0.56, 0.447, 0.75),
"1986.10.19" = c(0.8, NA, 0.83),
"1987.01.19" = c(0.75, 0.65,0.75),
"1987.06.19" = c(0.1, 0.55,0.811),
"1987.10.19" = c(0.15, 0.12, 0.780),
"1988.01.19" = c(0.2, 0.22,0.32),
"1988.06.19" = c(0.18, 0.21,0.23),
"1988.10.19" = c(0.21, 0.24, 0.250),
check.names = FALSE,
stringsAsFactors = FALSE)
Using:
j1 <- findInterval(as.Date(data1989$date_fire), as.Date(names(data1989)[-(1:3)],format="%Y.%m.%d"))
m1 <- cbind(rep(seq_len(nrow(data1989)), j1), sequence(j1))
data1989$meanPreFire <- tapply(data1989[-(1:3)][m1], m1[,1], FUN = mean, na.rm = TRUE)
> data1989
date_fire type meanRainfall 1986.01.01 1986.06.03 1986.10.19 1987.01.19 1987.06.19 1987.10.19 1988.01.19 1988.06.19 1988.10.19 meanPreFire
1 1987-02-01 oak 600 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21 0.6525
2 1987-07-03 pine 300 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24 0.5590
3 1988-01-01 oak 450 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25 0.7635
Reshape data to the long form and filter dates prior to the forest fire.
library(tidyverse)
data1989 %>%
pivot_longer(-date_fire, names_to = "date") %>%
mutate(date_fire = as.Date(date_fire),
date = as.Date(date, "X%Y.%m.%d")) %>%
filter(date < date_fire) %>%
group_by(date_fire) %>%
summarise(meanPreFire = mean(value, na.rm = T))
# # A tibble: 3 x 2
# date_fire meanPreFire
# <date> <dbl>
# 1 1987-01-01 0.62
# 2 1987-07-03 0.559
# 3 1988-01-01 0.764
The solution would be much more concise if we would keep the data in long(er) form... but this reproduces the desired output:
library(dplyr)
library(tidyr)
data1989 %>%
pivot_longer(-date_fire, names_to = "date_NDVI", values_to = "value", names_prefix = "^X") %>%
mutate(date_fire = as.Date(date_fire, "%Y-%m-%d"),
date_NDVI = as.Date(date_NDVI, "%Y.%m.%d")) %>%
group_by(date_fire) %>%
mutate(period = ifelse(date_NDVI < date_fire, "before_fire", "after_fire")) %>%
group_by(date_fire, period) %>%
mutate(average_NDVI = mean(value, na.rm = TRUE)) %>%
pivot_wider(names_from = date_NDVI, names_prefix = "X", values_from = value) %>%
pivot_wider(names_from = period, values_from = average_NDVI) %>%
group_by(date_fire) %>%
summarise_all(funs(sum(., na.rm=T)))
Returns:
# A tibble: 3 x 12
date_fire `X1986-01-01` `X1986-06-03` `X1986-10-19` `X1987-01-19` `X1987-06-19` `X1987-10-19` `X1988-01-19` `X1988-06-19` `X1988-10-19` before_fire after_fire
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1987-01-01 0.5 0.56 0.8 0.75 0.1 0.15 0.2 0.18 0.21 0.62 0.265
2 1987-07-03 0.589 0.447 0 0.65 0.55 0.12 0.22 0.21 0.24 0.559 0.198
3 1988-01-01 0.66 0.75 0.83 0.75 0.811 0.78 0.32 0.23 0.25 0.764 0.267
Edit:
If we stop the expression right after calculating the averages we can use the data in this structure to easily calculate the variance or account for variable number of observations. I think it's ok to keep the date_fireas its own column, but I'd suggest leaving the other dates as a column (because they correspond to observations). Especially if we want to do more analysis with the data using ggplot2 and other tidyverse functions.
We can use base R, by creating a row/column index. The column index can be got from findInterval with the column names and the 'date_fire'
j1 <- findInterval(as.Date(data1989$date_fire), as.Date(names(data1989)[-1]))
l1 <- lapply(j1+1, `:`, ncol(data1989)-1)
m1 <- cbind(rep(seq_len(nrow(data1989)), j1), sequence(j1))
m2 <- cbind(rep(seq_len(nrow(data1989)), lengths(l1)), unlist(l1))
data1989$meanPreFire <- tapply(data1989[-1][m1], m1[,1], FUN = mean, na.rm = TRUE)
data1989$meanPostFire <- tapply(data1989[-1][m2], m2[,1], FUN = mean, na.rm = TRUE)
data1989
# date_fire 1986-01-01 1986-06-03 1986-10-19 1987-01-19 1987-06-19 1987-10-19 1988-01-19 1988-06-19 1988-10-19
#1 1987-01-01 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21
#2 1987-07-03 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24
#3 1988-01-01 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25
# meanPreFire meanPostFire
#1 0.6200 0.2650000
#2 0.5590 0.1975000
#3 0.7635 0.2666667
Or using melt/dcast from data.table
library(data.table)
dcast(melt(setDT(data1989), id.var = 'date_fire')[,
.(value = mean(value, na.rm = TRUE)),
.(date_fire, grp = c('postFire', 'preFire')[1 + (as.IDate(variable) < as.IDate(date_fire))]) ], date_fire ~ grp)[data1989, on = .(date_fire)]
# date_fire postFire preFire 1986-01-01 1986-06-03 1986-10-19 1987-01-19 1987-06-19 1987-10-19 1988-01-19 1988-06-19
#1: 1987-01-01 0.2650000 0.6200 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18
#2: 1987-07-03 0.1975000 0.5590 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21
#3: 1988-01-01 0.2666667 0.7635 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23
# 1988-10-19
#1: 0.21
#2: 0.24
#3: 0.25
data
data1989 <- data.frame("date_fire" = c("1987-01-01", "1987-07-03", "1988-01-01"),
"1986-01-01" = c(0.5, 0.589, 0.66),
"1986-06-03" = c(0.56, 0.447, 0.75),
"1986-10-19" = c(0.8, NA, 0.83),
"1987-01-19" = c(0.75, 0.65,0.75),
"1987-06-19" = c(0.1, 0.55,0.811),
"1987-10-19" = c(0.15, 0.12, 0.780),
"1988-01-19" = c(0.2, 0.22,0.32),
"1988-06-19" = c(0.18, 0.21,0.23),
"1988-10-19" = c(0.21, 0.24, 0.250), check.names = FALSE,
stringsAsFactors = FALSE)

How can I easily combine the output of grouped summaries with an overall output for the data

I've used group_by with the summarise command in dplyr to generate some summaries for my data. I would like to get the same summaries for the overall data set and combine it as one tibble.
Is there a straighforward way of doing this? My solution below feels like it has 4X the amount of code required to do this efficently!
Thanks in advance.
# reprex
library(tidyverse)
tidy_data <- tibble::tribble(
~drug, ~gender, ~condition, ~value,
"control", "f", "work", 0.06,
"treatment", "m", "work", 0.42,
"treatment", "f", "work", 0.22,
"control", "m", "work", 0.38,
"treatment", "m", "work", 0.57,
"treatment", "f", "work", 0.24,
"control", "f", "work", 0.61,
"control", "f", "play", 0.27,
"treatment", "m", "play", 0.3,
"treatment", "f", "play", 0.09,
"control", "m", "play", 0.84,
"control", "m", "play", 0.65,
"treatment", "m", "play", 0.98,
"treatment", "f", "play", 0.38
)
tidy_summaries <- tidy_data %>%
# Group by the required variables
group_by(drug, gender, condition) %>%
summarise(mean = mean(value),
median = median(value),
min = min(value),
max = max(value)) %>%
# Bind rows will bind this output to the following one
bind_rows(
# Now for the overall version
tidy_data %>%
# Generate the overall summary values
mutate(mean = mean(value),
median = median(value),
min = min(value),
max = max(value)) %>%
# We need to know what the structure of the 'grouped_by' tibble first
# as the overall output format needs to match that
select(drug, gender, condition, mean:max) %>% # Keep columns of interest
# The same information will be appended to all rows, so we just need to retain one
filter(row_number() == 1) %>%
# Change the values in drug, gender, condition to "overall"
mutate_at(vars(drug:condition),
list(~ifelse(is.character(.), "overall", .)))
)
This the output I want, but it wasn't as simple as I might have hoped.
tidy_summaries
#> # A tibble: 9 x 7
#> # Groups: drug, gender [5]
#> drug gender condition mean median min max
#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 control f play 0.27 0.27 0.27 0.27
#> 2 control f work 0.335 0.335 0.06 0.61
#> 3 control m play 0.745 0.745 0.65 0.84
#> 4 control m work 0.38 0.38 0.38 0.38
#> 5 treatment f play 0.235 0.235 0.09 0.38
#> 6 treatment f work 0.23 0.23 0.22 0.24
#> 7 treatment m play 0.64 0.64 0.3 0.98
#> 8 treatment m work 0.495 0.495 0.42 0.570
#> 9 overall overall overall 0.429 0.38 0.06 0.98
Try
tidy_data %>%
group_by(drug, gender, condition) %>%
summarise(mean = mean(value), median = median(value), min = min(value), max = max(value)) %>%
bind_rows(.,
tidy_data %>%
summarise(drug = "Overall", gender = "Overall", condition = "Overall", mean = mean(value), median = median(value), min = min(value), max = max(value))
)
This gives:
# A tibble: 9 x 7
# Groups: drug, gender [5]
drug gender condition mean median min max
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 control f play 0.27 0.27 0.27 0.27
2 control f work 0.335 0.335 0.06 0.61
3 control m play 0.745 0.745 0.65 0.84
4 control m work 0.38 0.38 0.38 0.38
5 treatment f play 0.235 0.235 0.09 0.38
6 treatment f work 0.23 0.23 0.22 0.24
7 treatment m play 0.64 0.64 0.3 0.98
8 treatment m work 0.495 0.495 0.42 0.570
9 Overall Overall Overall 0.429 0.38 0.06 0.98
The code summarizes it via groupings first, and then creates the final summary row from the original data and binds it at the very bottom.
Interesting question. My take is basically the same answer as #sumshyftw but uses mutate_if and summarise_at.
Code
library(hablar)
funs <- list(mean = ~mean(.),
median = ~median(.),
min = ~min(.),
max = ~max(.))
tidy_data %>%
group_by(drug, gender, condition) %>%
summarise_at(vars(value), funs) %>%
ungroup() %>%
bind_rows(., tidy_data %>% summarise_at(vars(value), funs)) %>%
mutate_if(is.character, ~if_na(., "Overall"))
Result
drug gender condition mean median min max
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 control f play 0.27 0.27 0.27 0.27
2 control f work 0.335 0.335 0.06 0.61
3 control m play 0.745 0.745 0.65 0.84
4 control m work 0.38 0.38 0.38 0.38
5 treatment f play 0.235 0.235 0.09 0.38
6 treatment f work 0.23 0.23 0.22 0.24
7 treatment m play 0.64 0.64 0.3 0.98
8 treatment m work 0.495 0.495 0.42 0.570
9 Overall Overall Overall 0.429 0.38 0.06 0.98

Strip leading zero from numeric vector without changing class

I have the following data, which is a few Major League Baseball statistics.
Year AVG SLG TB OBP IsoPow RC
1 1986 0.223 0.300 172 0.330 0.194 64.1
2 1987 0.261 0.356 271 0.329 0.230 92.8
3 1988 0.283 0.357 264 0.368 0.208 100.0
4 1989 0.248 0.328 247 0.351 0.178 91.9
5 1990 0.301 0.374 293 0.406 0.264 128.0
6 1991 0.292 0.367 262 0.410 0.222 118.2
Usually, percentage-type MLB statistics are displayed as a decimal, but with the leading zero removed. I'd like to do the same, but also preserve the class of the variable, which in this case is numeric.
For example, with bonds$AVG I'd like the result to be a numeric vector that looks exactly like
[1] .223 .261 .283 .248 .301 .292
Using sub, the vector goes from numeric to character, then back to its original numeric state after wrapping it with as.numeric.
> sub(0, "", bonds$AVG)
# [1] ".223" ".261" ".283" ".248" ".301" ".292"
> as.numeric(sub(0, "", bonds$AVG))
# [1] 0.223 0.261 0.283 0.248 0.301 0.292
Is this possible in R?
bonds <-
structure(list(Year = c(1986, 1987, 1988, 1989, 1990, 1991),
AVG = c(0.223, 0.261, 0.283, 0.248, 0.301, 0.292), SLG = c(0.3,
0.356, 0.357, 0.328, 0.374, 0.367), TB = c(172, 271, 264,
247, 293, 262), OBP = c(0.33, 0.329, 0.368, 0.351, 0.406,
0.41), IsoPow = c(0.194, 0.23, 0.208, 0.178, 0.264, 0.222
), RC = c(64.1, 92.8, 100, 91.9, 128, 118.2)), .Names = c("Year",
"AVG", "SLG", "TB", "OBP", "IsoPow", "RC"), row.names = c(NA,
6L), class = "data.frame")
Perhaps you could generalize the following by modifying print.data.frame?
f1 <- function(x) noquote(sub(0, "", x))
f1(bonds$AVG)
.223 .261 .283 .248 .301 .292

Resources