pivot_longer with column pairs - r

I am again struggling with transforming a wide df into a long one using pivot_longer. The data frame is the result of a power analysis for different effect sizes and sample sizes. This is what the original df looks like:
es_issue_owner es_independence es_party pwr_issue_owner_1200 pwr_independence_1200 pwr_party_1200 pwr_issue_owner_2400 pwr_independence_2400 pwr_party_2400
1 0.1 0.1 0.1 0.087 0.080 0.081 0.130 0.163 0.102
2 0.2 0.2 0.2 0.235 0.273 0.157 0.406 0.513 0.267
Or with dput:
example <- structure(list(
  es_issue_owner = c(0.1, 0.2),
  es_independence = c(0.1, 0.2),
  es_party = c(0.1, 0.2),
  pwr_issue_owner_1200 = c(0.087, 0.235),
  pwr_independence_1200 = c(0.08, 0.273),
  pwr_party_1200 = c(0.081, 0.157),
  pwr_issue_owner_2400 = c(0.13, 0.406),
  pwr_independence_2400 = c(0.163, 0.513),
  pwr_party_2400 = c(0.102, 0.267)
), row.names = 1:2, class = "data.frame")
Each effect size (es) for three measures ("independence", "issueowner", "party") is paired with a power calculation for a 1200 and a 2400 sample size. Based on the example above, the output I want would look like this:
type es pwr value
1 independence 0.1 1200 0.080
2 issue_owner 0.1 1200 0.087
3 party 0.1 1200 0.081
4 independence 0.2 1200 0.273
5 issue_owner 0.2 1200 0.235
6 party 0.2 1200 0.157
7 independence 0.1 2400 0.163
8 issue_owner 0.1 2400 0.130
9 party 0.1 2400 0.102
10 independence 0.2 2400 0.513
11 issue_owner 0.2 2400 0.406
12 party 0.2 2400 0.267
or, with dput:
output <- structure(list(
  type = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L),
                   .Label = c("independence", "issueowner", "party"),
                   class = "factor"),
  es = c(0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2),
  pwr = c(1200, 1200, 1200, 1200, 1200, 1200, 2400, 2400, 2400, 2400, 2400, 2400),
  value = c("0.080", "0.087", "0.081", "0.273", "0.235", "0.157",
            "0.163", "0.130", "0.102", "0.513", "0.406", "0.267")
), out.attrs = list(
  dim = c(type = 3L, es = 2L, pwr = 2L, value = 1L),
  dimnames = list(type = c("type=independence", "type=issueowner", "type=party"),
                  es = c("es=0.1", "es=0.2"),
                  pwr = c("pwr=1200", "pwr=2400"),
                  value = "value=NA")
), class = "data.frame", row.names = c(NA, -12L))
As a start I tried experimenting with this:
example %>%
  pivot_longer(cols = everything(),
               names_pattern = "(es_[A-Za-z]+)(pwr_[A-Za-z]+_1200)(pwr_[A-Za-z]+_2400)",
               # names_sep = "(?=\\d)_(?=\\d)",
               names_to = c("es", "pwr_1200", "pwr_2400"),
               values_to = "value")
But it did not work, so I tried it in two steps, which sort of works, but the "pairing" gets messed up:
example %>%
  # pivot_longer(cols = everything(),
  #              names_pattern = "(es_[A-Za-z]+)(pwr_[A-Za-z]+_1200)(pwr_[A-Za-z]+_2400)",
  #              # names_sep = "(?=\\d)_(?=\\d)",
  #              names_to = c("es", "pwr_1200", "pwr_2400"),
  #              values_to = "value")
  pivot_longer(cols = contains("pwr_"),
               # names_pattern = "es_pwr(.*)1200_pwr(.*)2400",
               names_sep = "_(?=\\d)",
               names_to = c("pwr_type", "pwr_sample"),
               values_to = "value") %>%
  pivot_longer(cols = contains("es_"),
               # names_pattern = "es_pwr(.*)1200_pwr(.*)2400",
               # names_sep = "_(?=\\d)",
               names_to = "es_type",
               values_to = "es")
I would appreciate any help!

library(tidyverse)

example %>%
  # lengthen the effect-size columns: "es_issue_owner" -> type = "issue_owner"
  pivot_longer(cols = starts_with("es"), names_to = "type",
               names_prefix = "es_", values_to = "es") %>%
  # lengthen the power columns: "pwr_issue_owner_1200" -> pwr = "issue_owner_1200"
  pivot_longer(cols = starts_with("pwr"), names_to = "pwr", names_prefix = "pwr_") %>%
  # keep only the rows where the measure prefix in `type` matches the one in `pwr`
  filter(substr(type, 1, 3) == substr(pwr, 1, 3)) %>%
  # extract the sample size (1200/2400) from the remaining name
  mutate(pwr = parse_number(pwr)) %>%
  arrange(pwr, es, type)
Output:
type es pwr value
1 independence 0.1 1200 0.08
2 issue_owner 0.1 1200 0.087
3 party 0.1 1200 0.081
4 independence 0.2 1200 0.273
5 issue_owner 0.2 1200 0.235
6 party 0.2 1200 0.157
7 independence 0.1 2400 0.163
8 issue_owner 0.1 2400 0.13
9 party 0.1 2400 0.102
10 independence 0.2 2400 0.513
11 issue_owner 0.2 2400 0.406
12 party 0.2 2400 0.267
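
An alternative sketch (my addition, not part of the original answer) that avoids the substr() matching: pivot the pwr columns first, letting names_pattern capture the measure and the sample size separately, then pivot the es columns and keep the matching measure. names_transform (tidyr >= 1.1.0) turns the captured sample size into a number:

example %>%
  pivot_longer(starts_with("pwr"),
               # "pwr_issue_owner_1200" -> type = "issue_owner", pwr = 1200
               names_pattern = "pwr_(.*)_(\\d+)",
               names_to = c("type", "pwr"),
               names_transform = list(pwr = as.numeric),
               values_to = "value") %>%
  pivot_longer(starts_with("es"),
               names_prefix = "es_",
               names_to = "es_type",
               values_to = "es") %>%
  # keep each power value paired with the effect size of the same measure
  filter(type == es_type) %>%
  select(type, es, pwr, value) %>%
  arrange(pwr, es, type)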


How do I get gt table to work with get_summary_stats

Dataset
Here is the dput of the head of my dataset:
structure(list(
  Year = c("2021", "2021", "2021", "2021", "2021", "2021"),
  Month_Number = c("9", "9", "9", "9", "9", "9"),
  Month_Name = c("September", "September", "September", "September", "September", "September"),
  Day_Name = c("Wednesday", "Thursday", "Friday", "Saturday", "Sunday", "Monday"),
  Time_Wake = c(500L, 715L, 600L, 600L, 700L, 600L),
  Mins_Sleep = c(300L, 540L, 540L, 480L, 480L, 480L),
  Start_Work = c(1015L, 1000L, 945L, 1400L, 1500L, 915L),
  End_Work = c(1800L, 1600L, 1210L, 1700L, 1515L, 1530L),
  Workout_Y_N = c("Y", "Y", "Y", "N", "N", "N"),
  Time_Workout = c(730L, 730L, 730L, NA, NA, NA),
  Work_Environment = c("Office", "Office", "Office", "Home", "Home", "Office"),
  Coffee_Cups = c(3L, 0L, 2L, 6L, 4L, 5L),
  Tea_Cups = c(2L, 4L, 2L, 0L, 0L, 2L),
  Mins_Work = c(435L, 350L, 145L, 135L, 15L, 60L)
), row.names = c(NA, 6L), class = "data.frame")
Script
I'm trying to produce a table of my summary stats but am having an issue using the gt library. Here is my script:
library(tidyverse)
library(gt)
library(rstatix)
work %>%
  get_summary_stats() %>%
  gt() %>%
  tab_header(
    title = md("The ***productivity*** dataset"),
    subtitle = md("Descriptives of five months of data on productivity, consumption, and sleep patterns.")
  ) %>%
  cols_label(
    n = md("Cases (N)"),
    min = md("Min"),
    max = md("Max"),
    median = md("Median"),
    q1 = md("Q1"),
    q3 = md("Q3"),
    iqr = md("IQR"),
    mad = md("MAD"),
    mean = md("Mean"),
    sd = md("SD"),
    se = md("SE"),
    ci = md("CI")
  ) %>%
  opt_align_table_header(align = "center") %>%
  tab_source_note(
    source_note = md("*Data obtained from local matrix.*")
  ) %>%
  tab_footnote(
    footnote = "Not a standardized unit.",
    locations = cells_stub(rows = "Coffee_Cups")
  )
Problem
The issue is that while this script produces the table, I'm unable to get the footnote to work: it doesn't recognize "Coffee_Cups" as a row. The table renders fine without the tab_footnote call.
This is the error it gives me, which doesn't help much:
Error: The following row(s) do not exist in the data: Coffee_Cups
I've tried tinkering with the other arguments of tab_footnote but haven't had any successful runs. What should I do?
(Posting this as an answer because it is too long for a comment.)
If you run the following (where w is the gt object built above):
> w$`_data`
# A tibble: 8 × 13
variable n min max median q1 q3 iqr mad mean sd se ci
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Coffee_Cups 6 0 6 3.5 2.25 4.75 2.5 2.22 3.33 2.16 0.882 2.27
2 End_Work 6 1210 1800 1565 1519. 1675 156. 137. 1559. 202. 82.5 212.
3 Mins_Sleep 6 300 540 480 480 525 45 44.5 470 88.3 36.1 92.7
4 Mins_Work 6 15 435 140 78.8 299. 220 152. 190 166. 67.9 174.
5 Start_Work 6 915 1500 1008. 959. 1304. 345 115. 1129. 253. 103. 266.
6 Tea_Cups 6 0 4 2 0.5 2 1.5 1.48 1.67 1.51 0.615 1.58
7 Time_Wake 6 500 715 600 600 675 75 74.1 619. 78.8 32.2 82.7
8 Time_Workout 3 730 730 730 730 730 0 0 730 0 0 0
There you can clearly see that Coffee_Cups is a value inside a column, not a row.
I tried changing the code like this:
tab_footnote(
  footnote = "Not a standardized unit.",
  locations = cells_stub(rows = c(1))
)
And I get the following error:
Error: Can't use NA as column index with `[` at position 1.
Run `rlang::last_error()` to see where the error occurred.
> rlang::last_error()
<error/tibble_error_na_column_index>
Can't use NA as column index with `[` at position 1.
Backtrace:
1. (function (x, ...) ...
2. gt:::print.gt_tbl(x)
3. gt:::as.tags.gt_tbl(x, ...)
4. gt:::render_as_html(data = x)
5. gt:::build_data(data = data, context = "html")
6. gt:::apply_footnotes_to_output(data = data, context = context)
8. tibble:::`[.tbl_df`(body, footnotes_data_marks$rownum[i], footnotes_data_marks$colname[i])
Run `rlang::last_trace()` to see the full context.
At this point we know the row index was accepted; now we need to figure out why the column doesn't match ...
I could make it work using the following:
work %>%
  get_summary_stats() %>%
  gt() %>%
  tab_header(
    title = md("The ***productivity*** dataset"),
    subtitle = md("Descriptives of five months of data on productivity, consumption, and sleep patterns.")
  ) %>%
  cols_label(
    n = md("Cases (N)"),
    min = md("Min"),
    max = md("Max"),
    median = md("Median"),
    q1 = md("Q1"),
    q3 = md("Q3"),
    iqr = md("IQR"),
    mad = md("MAD"),
    mean = md("Mean"),
    sd = md("SD"),
    se = md("SE"),
    ci = md("CI")
  ) %>%
  opt_align_table_header(align = "center") %>%
  tab_source_note(
    source_note = md("*Data obtained from local matrix.*")
  ) %>%
  tab_footnote(
    footnote = "Not a standardized unit.",
    # locations = cells_stub(rows = everything(vars = "Coffee_Cups"))
    locations = cells_body(1, 1)
  )
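
A footnote anchored on the Coffee_Cups row itself, rather than on the hard-coded cell (1, 1), can be targeted with a row predicate in cells_body(); a sketch of that idea (my addition, assuming the summary column is named variable as shown above):

tab_footnote(
  footnote = "Not a standardized unit.",
  # match the cell in the `variable` column whose value is "Coffee_Cups"
  locations = cells_body(columns = variable, rows = variable == "Coffee_Cups")
)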

In R, is there a way to to gather a data frame while only extracting part of the column name? [duplicate]

This question already has an answer here:
How to use Pivot_longer to reshape from wide-type data to long-type data with multiple variables (1 answer)
Closed 2 years ago.
Overview
So, I'm looking to tidy my data frame. I have found a solution to my problem, but it seems highly inefficient when working with my large dataset. Currently my code gathers the data frame, applies separate() to split the ticker from the metric, and then spreads the data appropriately. See the example below.
Data frame
structure(list(
  date = c("2009-07-01", "2009-07-02", "2009-07-06", "2009-07-07", "2009-07-08"),
  PRED.Open = c(0.5, 0.5, 0.7, 0.7, 0.7),
  PRED.High = c(0.5, 0.6, 0.7, 0.7, 0.7),
  PRED.Low = c(0.5, 0.5, 0.5, 0.7, 0.7),
  PRED.Close = c(0.5, 0.6, 0.5, 0.7, 0.7),
  PRED.Volume = c(0L, 300L, 200L, 0L, 0L),
  PRED.Adjusted = c(0.5, 0.6, 0.5, 0.7, 0.7),
  GDM.Open = c(1041.02002, 1085.109985, 1052.02002, 1011.429993, 1006.630005),
  GDM.High = c(1097.790039, 1085.109985, 1052.02002, 1029.290039, 1006.630005),
  GDM.Low = c(1041.02002, 1038.540039, 995.450012, 1005.280029, 948.73999),
  GDM.Close = c(1085.109985, 1052.02002, 1011.429993, 1006.630005, 966.22998),
  GDM.Volume = c(0L, 0L, 0L, 0L, 0L),
  GDM.Adjusted = c(1085.109985, 1052.02002, 1011.429993, 1006.630005, 966.22998),
  NBL.Open = c(29.885, 29.325001, 27.370001, 27.485001, 26.815001),
  NBL.High = c(30.35, 29.325001, 27.545, 27.610001, 27.18),
  NBL.Low = c(29.83, 28.07, 26.825001, 26.605, 25.745001)
), row.names = c(NA, -5L), class = "data.frame")
Current Solution
df <- df %>% gather(c(2:ncol(df)), key = "ticker", value = "val")
df <- separate(df, col = "ticker", into = c("ticker", "metric"), sep = "\\.") %>%
  ungroup() %>%
  spread(key = "metric", value = "val") %>%
  arrange(ticker, date)
Desired Outcome
A long data frame with one row per ticker and date, and one column per metric (Open, High, Low, Close, Volume, Adjusted), as in the output shown in the answer below.
Question
Is there a more efficient way to accomplish this?
If you use pivot_longer from tidyr 1.0.0 you can do this in one step. The special ".value" sentinel in names_to tells pivot_longer that the captured part of the name supplies the output column, so no separate/spread round-trip is needed:
tidyr::pivot_longer(df,
                    cols = -date,
                    names_to = c('ticker', '.value'),
                    names_sep = '\\.') %>%
  dplyr::arrange(ticker, date)
# A tibble: 15 x 8
# date ticker Open High Low Close Volume Adjusted
# <chr> <chr> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
# 1 2009-07-01 GDM 1041.0 1097.8 1041.0 1085.1 0 1085.1
# 2 2009-07-02 GDM 1085.1 1085.1 1038.5 1052.0 0 1052.0
# 3 2009-07-06 GDM 1052.0 1052.0 995.45 1011.4 0 1011.4
# 4 2009-07-07 GDM 1011.4 1029.3 1005.3 1006.6 0 1006.6
# 5 2009-07-08 GDM 1006.6 1006.6 948.74 966.23 0 966.23
# 6 2009-07-01 NBL 29.885 30.35 29.83 NA NA NA
# 7 2009-07-02 NBL 29.325 29.325 28.07 NA NA NA
# 8 2009-07-06 NBL 27.370 27.545 26.825 NA NA NA
# 9 2009-07-07 NBL 27.485 27.610 26.605 NA NA NA
#10 2009-07-08 NBL 26.815 27.18 25.745 NA NA NA
#11 2009-07-01 PRED 0.5 0.5 0.5 0.5 0 0.5
#12 2009-07-02 PRED 0.5 0.6 0.5 0.6 300 0.6
#13 2009-07-06 PRED 0.7 0.7 0.5 0.5 200 0.5
#14 2009-07-07 PRED 0.7 0.7 0.7 0.7 0 0.7
#15 2009-07-08 PRED 0.7 0.7 0.7 0.7 0 0.7
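
The ".value" sentinel is what makes this work: it marks the captured part of each name as the name of an output column rather than as data. A minimal toy illustration (hypothetical data, not from the question):

library(tidyr)
toy <- data.frame(id = 1, A.x = 1, A.y = 2, B.x = 3, B.y = 4)
pivot_longer(toy, -id, names_to = c("grp", ".value"), names_sep = "\\.")
# # A tibble: 2 x 4
# #      id grp       x     y
# # 1     1 A         1     2
# # 2     1 B         3     4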

Computing mean of different columns depending on date

My data set is about forest fires and NDVI values (a value ranging from 0 to 1, indicating how green the surface is). It has an initial column saying when the forest fire in that row took place, and subsequent columns indicating the NDVI value on different dates, before and after the fire. NDVI values before the fire are substantially higher than values after the fire. Something like:
data1989 <- data.frame("date_fire" = c("1987-01-01", "1987-07-03", "1988-01-01"),
                       "1986-01-01" = c(0.5, 0.589, 0.66),
                       "1986-06-03" = c(0.56, 0.447, 0.75),
                       "1986-10-19" = c(0.8, NA, 0.83),
                       "1987-01-19" = c(0.75, 0.65, 0.75),
                       "1987-06-19" = c(0.1, 0.55, 0.811),
                       "1987-10-19" = c(0.15, 0.12, 0.780),
                       "1988-01-19" = c(0.2, 0.22, 0.32),
                       "1988-06-19" = c(0.18, 0.21, 0.23),
                       "1988-10-19" = c(0.21, 0.24, 0.250),
                       stringsAsFactors = FALSE)
> data1989
date_fire X1986.01.01 X1986.06.03 X1986.10.19 X1987.01.19 X1987.06.19 X1987.10.19 X1988.01.19 X1988.06.19 X1988.10.19
1 1987-01-01 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21
2 1987-07-03 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24
3 1988-01-01 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25
I would like to compute, in a new column, the average of the NDVI values PRIOR to the forest fire. For row one, that would be the average of columns 2, 3, 4 and 5.
What I need to get is:
date_fire X1986.01.01 X1986.06.03 X1986.10.19 X1987.01.19 X1987.06.19 X1987.10.19 X1988.01.19 X1988.06.19 X1988.10.19 meanPreFire
1 1987-01-01 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21 0.653
2 1987-07-03 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24 0.559
3 1988-01-01 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25 0.764
Thanks!
EDIT: SOLUTION
This is how to adapt the code when there is more than one leading column to exclude:
data1989 <- data.frame("date_fire" = c("1987-02-01", "1987-07-03", "1988-01-01"),
                       "type" = c("oak", "pine", "oak"),
                       "meanRainfall" = c(600, 300, 450),
                       "1986.01.01" = c(0.5, 0.589, 0.66),
                       "1986.06.03" = c(0.56, 0.447, 0.75),
                       "1986.10.19" = c(0.8, NA, 0.83),
                       "1987.01.19" = c(0.75, 0.65, 0.75),
                       "1987.06.19" = c(0.1, 0.55, 0.811),
                       "1987.10.19" = c(0.15, 0.12, 0.780),
                       "1988.01.19" = c(0.2, 0.22, 0.32),
                       "1988.06.19" = c(0.18, 0.21, 0.23),
                       "1988.10.19" = c(0.21, 0.24, 0.250),
                       check.names = FALSE,
                       stringsAsFactors = FALSE)
Using:
# column index of the last NDVI date at or before each fire
j1 <- findInterval(as.Date(data1989$date_fire),
                   as.Date(names(data1989)[-(1:3)], format = "%Y.%m.%d"))
# two-column (row, column) matrix indexing all pre-fire cells
m1 <- cbind(rep(seq_len(nrow(data1989)), j1), sequence(j1))
# mean of the pre-fire cells, grouped by row
data1989$meanPreFire <- tapply(data1989[-(1:3)][m1], m1[, 1], FUN = mean, na.rm = TRUE)
> data1989
date_fire type meanRainfall 1986.01.01 1986.06.03 1986.10.19 1987.01.19 1987.06.19 1987.10.19 1988.01.19 1988.06.19 1988.10.19 meanPreFire
1 1987-02-01 oak 600 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21 0.6525
2 1987-07-03 pine 300 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24 0.5590
3 1988-01-01 oak 450 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25 0.7635
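
The m1 object works because indexing a data frame or matrix with a two-column matrix selects individual (row, column) cells. A tiny illustration with made-up values:

m <- matrix(1:6, nrow = 2)
m
#      [,1] [,2] [,3]
# [1,]    1    3    5
# [2,]    2    4    6
m[cbind(c(1, 2), c(2, 3))]  # cells (1,2) and (2,3)
# [1] 3 6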
Reshape data to the long form and filter dates prior to the forest fire.
library(tidyverse)

data1989 %>%
  pivot_longer(-date_fire, names_to = "date") %>%
  mutate(date_fire = as.Date(date_fire),
         date = as.Date(date, "X%Y.%m.%d")) %>%
  filter(date < date_fire) %>%
  group_by(date_fire) %>%
  summarise(meanPreFire = mean(value, na.rm = TRUE))
# # A tibble: 3 x 2
# date_fire meanPreFire
# <date> <dbl>
# 1 1987-01-01 0.62
# 2 1987-07-03 0.559
# 3 1988-01-01 0.764
The solution would be much more concise if we kept the data in long(er) form... but this reproduces the desired output:
library(dplyr)
library(tidyr)

data1989 %>%
  pivot_longer(-date_fire, names_to = "date_NDVI", values_to = "value", names_prefix = "^X") %>%
  mutate(date_fire = as.Date(date_fire, "%Y-%m-%d"),
         date_NDVI = as.Date(date_NDVI, "%Y.%m.%d")) %>%
  group_by(date_fire) %>%
  mutate(period = ifelse(date_NDVI < date_fire, "before_fire", "after_fire")) %>%
  group_by(date_fire, period) %>%
  mutate(average_NDVI = mean(value, na.rm = TRUE)) %>%
  pivot_wider(names_from = date_NDVI, names_prefix = "X", values_from = value) %>%
  pivot_wider(names_from = period, values_from = average_NDVI) %>%
  group_by(date_fire) %>%
  summarise_all(~ sum(., na.rm = TRUE))  # funs() from the original is deprecated; the lambda is equivalent
Returns:
# A tibble: 3 x 12
date_fire `X1986-01-01` `X1986-06-03` `X1986-10-19` `X1987-01-19` `X1987-06-19` `X1987-10-19` `X1988-01-19` `X1988-06-19` `X1988-10-19` before_fire after_fire
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1987-01-01 0.5 0.56 0.8 0.75 0.1 0.15 0.2 0.18 0.21 0.62 0.265
2 1987-07-03 0.589 0.447 0 0.65 0.55 0.12 0.22 0.21 0.24 0.559 0.198
3 1988-01-01 0.66 0.75 0.83 0.75 0.811 0.78 0.32 0.23 0.25 0.764 0.267
Edit:
If we stop the expression right after calculating the averages, we can use the data in that structure to easily calculate the variance or account for a variable number of observations. I think it's fine to keep date_fire as its own column, but I'd suggest leaving the other dates in one long column (because they correspond to observations), especially if we want to do more analysis with the data using ggplot2 and other tidyverse functions.
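
A sketch of that idea (my addition, not part of the original answer): stop after the grouping step and summarise several statistics at once.

data1989 %>%
  pivot_longer(-date_fire, names_to = "date_NDVI", values_to = "value", names_prefix = "^X") %>%
  mutate(date_fire = as.Date(date_fire, "%Y-%m-%d"),
         date_NDVI = as.Date(date_NDVI, "%Y.%m.%d")) %>%
  mutate(period = ifelse(date_NDVI < date_fire, "before_fire", "after_fire")) %>%
  group_by(date_fire, period) %>%
  summarise(mean_NDVI = mean(value, na.rm = TRUE),
            var_NDVI = var(value, na.rm = TRUE),   # variance, as mentioned above
            n_obs = sum(!is.na(value)))            # number of non-missing observations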
We can use base R by creating a row/column index. The column index can be obtained from findInterval with the column names and 'date_fire':
# last pre-fire column for each row
j1 <- findInterval(as.Date(data1989$date_fire), as.Date(names(data1989)[-1]))
# post-fire column ranges for each row
l1 <- lapply(j1 + 1, `:`, ncol(data1989) - 1)
# (row, column) index matrices for the pre-fire and post-fire cells
m1 <- cbind(rep(seq_len(nrow(data1989)), j1), sequence(j1))
m2 <- cbind(rep(seq_len(nrow(data1989)), lengths(l1)), unlist(l1))
data1989$meanPreFire <- tapply(data1989[-1][m1], m1[, 1], FUN = mean, na.rm = TRUE)
data1989$meanPostFire <- tapply(data1989[-1][m2], m2[, 1], FUN = mean, na.rm = TRUE)
data1989
# date_fire 1986-01-01 1986-06-03 1986-10-19 1987-01-19 1987-06-19 1987-10-19 1988-01-19 1988-06-19 1988-10-19
#1 1987-01-01 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21
#2 1987-07-03 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24
#3 1988-01-01 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25
# meanPreFire meanPostFire
#1 0.6200 0.2650000
#2 0.5590 0.1975000
#3 0.7635 0.2666667
Or using melt/dcast from data.table: melt to long form, compute the pre/post-fire group means, reshape back to wide with dcast, and join the result onto the original data:
library(data.table)

dcast(melt(setDT(data1989), id.var = 'date_fire')[,
        .(value = mean(value, na.rm = TRUE)),
        .(date_fire,
          grp = c('postFire', 'preFire')[1 + (as.IDate(variable) < as.IDate(date_fire))])],
      date_fire ~ grp)[data1989, on = .(date_fire)]
# date_fire postFire preFire 1986-01-01 1986-06-03 1986-10-19 1987-01-19 1987-06-19 1987-10-19 1988-01-19 1988-06-19
#1: 1987-01-01 0.2650000 0.6200 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18
#2: 1987-07-03 0.1975000 0.5590 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21
#3: 1988-01-01 0.2666667 0.7635 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23
# 1988-10-19
#1: 0.21
#2: 0.24
#3: 0.25
data
data1989 <- data.frame("date_fire" = c("1987-01-01", "1987-07-03", "1988-01-01"),
                       "1986-01-01" = c(0.5, 0.589, 0.66),
                       "1986-06-03" = c(0.56, 0.447, 0.75),
                       "1986-10-19" = c(0.8, NA, 0.83),
                       "1987-01-19" = c(0.75, 0.65, 0.75),
                       "1987-06-19" = c(0.1, 0.55, 0.811),
                       "1987-10-19" = c(0.15, 0.12, 0.780),
                       "1988-01-19" = c(0.2, 0.22, 0.32),
                       "1988-06-19" = c(0.18, 0.21, 0.23),
                       "1988-10-19" = c(0.21, 0.24, 0.250),
                       check.names = FALSE,
                       stringsAsFactors = FALSE)

Ordering a subset of columns by date - r

I have a data frame in which some of the columns (dates) are not in the correct order. See:
data1989 <- data.frame("date_fire" = c("1987-02-01", "1987-07-03", "1988-01-01"),
                       "Foresttype" = c("oak", "pine", "oak"),
                       "meanSolarRad" = c(500, 550, 450),
                       "meanRainfall" = c(600, 300, 450),
                       "meanTemp" = c(14, 15, 12),
                       "1988.01.01" = c(0.5, 0.589, 0.66),
                       "1986.06.03" = c(0.56, 0.447, 0.75),
                       "1986.10.19" = c(0.8, NA, 0.83),
                       "1988.01.19" = c(0.75, 0.65, 0.75),
                       "1986.06.19" = c(0.1, 0.55, 0.811),
                       "1987.10.19" = c(0.15, 0.12, 0.780),
                       "1988.01.19" = c(0.2, 0.22, 0.32),
                       "1986.06.19" = c(0.18, 0.21, 0.23),
                       "1987.10.19" = c(0.21, 0.24, 0.250),
                       check.names = FALSE,
                       stringsAsFactors = FALSE)
> data1989
date_fire Foresttype meanSolarRad meanRainfall meanTemp 1988.01.01 1986.06.03 1986.10.19 1988.01.19 1986.06.19 1987.10.19 1988.01.19 1986.06.19 1987.10.19
1 1987-02-01 oak 500 600 14 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21
2 1987-07-03 pine 550 300 15 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24
3 1988-01-01 oak 450 450 12 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25
I would like to order the columns by increasing date, and keep the first 5 columns the same. Keep in mind that in my original dataset I have 30 initial columns to be kept the same.
As commented, try to avoid wide-formatted data whose column headers contain data elements such as dates, category values, or other indicators. Instead use long-formatted, tidy data, where ordering is much easier, as are aggregation, merging, plotting, and modeling.
Specifically, consider reshape to melt the dates into a single field, here called quarter, with a value column. The quarter column can then be ordered easily:
# RESHAPE WIDE TO LONG
long_data1989 <- reshape(data1989,
                         varying = names(data1989)[6:ncol(data1989)],
                         times = names(data1989)[6:ncol(data1989)],
                         v.names = "value", timevar = "quarter", ids = NULL,
                         new.row.names = 1:1E4, direction = "long")

# ORDER DATES AND RESET row.names
long_data1989 <- `row.names<-`(with(long_data1989,
                                    long_data1989[order(date_fire, quarter), ]),
                               NULL)
long_data1989
If you want to use dplyr, here is an alternative. Note that each column name would have to be unique; in your df there were some duplicated ones (commented out below).
library(dplyr)

data1989 <- data.frame("date_fire" = c("1987-02-01", "1987-07-03", "1988-01-01"),
                       "Foresttype" = c("oak", "pine", "oak"),
                       "meanSolarRad" = c(500, 550, 450),
                       "meanRainfall" = c(600, 300, 450),
                       "meanTemp" = c(14, 15, 12),
                       "1988.01.01" = c(0.5, 0.589, 0.66),
                       "1986.06.03" = c(0.56, 0.447, 0.75),
                       "1986.10.19" = c(0.8, NA, 0.83),
                       "1988.01.19" = c(0.75, 0.65, 0.75),
                       "1986.06.19" = c(0.1, 0.55, 0.811),
                       "1987.10.19" = c(0.15, 0.12, 0.780),
                       # "1988.01.19" = c(0.2, 0.22, 0.32),
                       # "1986.06.19" = c(0.18, 0.21, 0.23),
                       # "1987.10.19" = c(0.21, 0.24, 0.250),
                       check.names = FALSE,
                       stringsAsFactors = FALSE)
# Sort the date column names (replace 6 with the index of the first date column)
sorted_colnames <- sort(names(data1989)[6:ncol(data1989)])

# Sort the columns (replace 5 with the index of the last non-date column)
data1989 %>%
  select(1:5, sorted_colnames)
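
One caveat (my note, not part of the original answer): recent dplyr/tidyselect versions warn when an external character vector is passed directly to select(); wrapping it in all_of() is the recommended form:

data1989 %>%
  select(1:5, all_of(sorted_colnames))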
We can convert the column names that are dates to Date class, order them, and then use that as a column index:
i1 <- grep('^\\d{4}\\.\\d{2}\\.\\d{2}$', names(data1989))
data1989[c(seq_len(i1[1] - 1),
           order(as.Date(names(data1989)[i1], "%Y.%m.%d")) + i1[1] - 1)]
# date_fire Foresttype meanSolarRad meanRainfall meanTemp 1986.06.03 1986.06.19 1986.06.19.1 1986.10.19 1987.10.19
#1 1987-02-01 oak 500 600 14 0.560 0.100 0.18 0.80 0.15
#2 1987-07-03 pine 550 300 15 0.447 0.550 0.21 NA 0.12
#3 1988-01-01 oak 450 450 12 0.750 0.811 0.23 0.83 0.78
# 1987.10.19.1 1988.01.01 1988.01.19 1988.01.19.1
#1 0.21 0.500 0.75 0.20
#2 0.24 0.589 0.65 0.22
#3 0.25 0.660 0.75 0.32
Base R solution (similar to #Parfaits):
# Reshape dataframe wide --> long:
df_long <- reshape(data1989,
                   direction = "long",
                   varying = which(!(is.na(as.Date(names(data1989), "%Y.%m.%d")))),
                   idvar = which(is.na(as.Date(names(data1989), "%Y.%m.%d"))),
                   v.names = "value",
                   times = na.omit(as.Date(names(data1989), "%Y.%m.%d")),
                   timevar = "date_surveyed",
                   new.row.names = 1:(nrow(data1989) *
                                        length(na.omit(as.Date(names(data1989), "%Y.%m.%d")))))

# Order the data frame and reset the index:
ordered_df_long <- data.frame(df_long[with(df_long, order(date_fire, date_surveyed)), ],
                              row.names = NULL)

loops in R, finding the mean for one column depending on another column

So my test data looks like this:
structure(list(
  day = c(1L, 1L, 2L, 2L, 2L, 3L, 3L, 4L, 4L, 4L),
  Left = c(0.25, 0.33, 0, 0, 0.25, 0.33, 0.5, 0.33, 0.5, 0),
  Left1 = c(NA, NA, 0, 0.5, 0.25, 0.33, 0.1, 0.33, 0.5, 0),
  Middle = c(0, 0, 0.3, 0, 0.25, 0, 0.3, 0.33, 0, 0),
  Right = c(0.25, 0.33, 0.3, 0.5, 0.25, 0.33, 0.1, 0, 0, 0.25),
  Right1 = c(0.5, 0.33, 0.3, 0, 0, 0, 0, 0, 0, 0.75),
  Side = structure(c(2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L),
                   .Label = c("L", "R"), class = "factor")
), .Names = c("day", "Left", "Left1", "Middle", "Right", "Right1", "Side"),
class = "data.frame", row.names = c(NA, -10L))
or this:
day Left Left1 Middle Right Right1 Side
1 0.25 NA 0.00 0.25 0.50 R
1 0.33 NA 0.00 0.33 0.33 R
2 0.00 0.00 0.30 0.30 0.30 R
2 0.00 0.50 0.00 0.50 0.00 R
2 0.25 0.25 0.25 0.25 0.00 L
3 0.33 0.33 0.00 0.33 0.00 L
I would like to write a loop to find the standard error and average value for each day on the chosen side. So far I have this code:
td <- read.csv('test data.csv')
IDs <- unique(td$day)
se <- function(x) sqrt(var(x)/length(x))

for (i in 1:length(IDs)) {
  day.i <- which(td$day == IDs[i])
  td.i <- td[day.i, ]
  if (td$Side == 'L') {
    side <- cbind(td.i$Left + td.i$Left1)
  } else {
    side <- cbind(td.i$Right + td.i$Right1)
  }
  mean(side)
  se(side)
  print(mean)
  print(se)
}
But I am getting error messages like this:
Error: unexpected '}' in "}"
Obviously, I am also not getting the printout of means for each day. Does anyone know why?
also working on things here: http://www.talkstats.com/showthread.php/27187-Writing-a-mean-loop..-(literally)
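
For reference, a minimal corrected sketch of the loop (my reading of "chosen side": use each row's own Side value to pick the Left or Right columns; ifelse() works row-wise, whereas the if() in the original tests only a single condition and so cannot handle a whole column):

se <- function(x) sqrt(var(x, na.rm = TRUE) / sum(!is.na(x)))
for (d in unique(td$day)) {
  td.d <- td[td$day == d, ]
  # pick Left or Right columns row by row, according to each row's Side
  vals <- ifelse(td.d$Side == "L",
                 td.d$Left + td.d$Left1,
                 td.d$Right + td.d$Right1)
  cat("day", d, "mean:", mean(vals, na.rm = TRUE), "se:", se(vals), "\n")
}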
Convert your data into a list and work with that instead:
First, split up your data into a list according to Side, subsetting the relevant columns along the way.
td <- split(td, td$Side)
NAMES <- names(td)
td <- lapply(1:length(td),
             function(x) td[[x]][c(1, grep(NAMES[x], names(td[[x]])))])
names(td) <- NAMES
td
td
# $L
# day Left Left1
# 5 2 0.25 0.25
# 6 3 0.33 0.33
# 7 3 0.50 0.10
# 8 4 0.33 0.33
# 9 4 0.50 0.50
#
# $R
# day Right Right1
# 1 1 0.25 0.50
# 2 1 0.33 0.33
# 3 2 0.30 0.30
# 4 2 0.50 0.00
# 10 4 0.25 0.75
Then, use lapply and aggregate to apply whatever functions you want to your data.
lapply(1:length(td),
       function(x) aggregate(list(td[[x]][-1]),
                             list(day = td[[x]]$day), mean))
# [[1]]
# day Left Left1
# 1 2 0.250 0.250
# 2 3 0.415 0.215
# 3 4 0.415 0.415
#
# [[2]]
# day Right Right1
# 1 1 0.29 0.415
# 2 2 0.40 0.150
# 3 4 0.25 0.750
I'm still not entirely sure I understand (that is, whether you want the mean and SE for both Left and Left1 or some combination like their sum). This is how I interpreted your question:
FUN <- function(dat, side = "L") {
  DF <- split(dat, dat$Side)[[side]]
  ind <- if (side == "L") 2:3 else 5:6
  stderr <- function(x) sqrt(var(x)/length(x))
  meanNse <- function(x) c(mean = mean(x), se = stderr(x))
  OUT <- aggregate(DF[, ind], list(DF[, 1]), meanNse)
  names(OUT)[1] <- "day"
  return(OUT)
}

# test it
FUN(td)
FUN(td, "R")
Which yields:
> FUN(td)
day Left.mean Left.se Left1.mean Left1.se
1 2 0.250 NA 0.250 NA
2 3 0.415 0.085 0.215 0.115
3 4 0.415 0.085 0.415 0.085
> FUN(td, "R")
day Right.mean Right.se Right1.mean Right1.se
1 1 0.29 0.04 0.415 0.085
2 2 0.40 0.10 0.150 0.150
3 4 0.25 NA 0.750 NA
