Breaking the tapply junkie habit - r

I've learned R by toying, and I'm starting to think that I'm abusing the tapply function. Are there better ways to do some of the following actions? Granted, they work, but as they get more complex I wonder if I'm losing out on better options. I'm looking for some criticism, here:
tapply(var1, list(fac1, fac2), mean, na.rm=T)
tapply(var1, fac1, sum, na.rm=T) / tapply(var2, fac1, sum, na.rm=T)
cumsum(tapply(var1, fac1, sum, na.rm=T)) / sum(var1)
Update: Here's some example data...
var1 var2 fac1 fac2
1 NA 275.54 10 (266,326]
2 NA 565.89 10 (552,818]
3 NA 815.41 6 (552,818]
4 NA 281.77 6 (266,326]
5 NA 640.24 NA (552,818]
6 NA 78.42 NA [78.4,266]
7 NA 1027.06 NA (818,1.55e+03]
8 NA 355.20 NA (326,552]
9 NA 464.52 NA (326,552]
10 NA 1397.11 10 (818,1.55e+03]
11 NA 229.82 NA [78.4,266]
12 NA 542.77 NA (326,552]
13 NA 829.32 NA (818,1.55e+03]
14 NA 284.78 NA (266,326]
15 NA 194.97 10 [78.4,266]
16 NA 672.55 8 (552,818]
17 NA 348.01 10 (326,552]
18 NA 1550.79 9 (818,1.55e+03]
19 101.98 101.98 4 [78.4,266]
20 NA 292.80 6 (266,326]
Update data dump:
structure(list(var1 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 101.98, NA), var2 = c(275.54,
565.89, 815.41, 281.77, 640.24, 78.42, 1027.06, 355.2, 464.52,
1397.11, 229.82, 542.77, 829.32, 284.78, 194.97, 672.55, 348.01,
1550.79, 101.98, 292.8), fac1 = c(10L, 10L, 6L, 6L, NA, NA, NA,
NA, NA, 10L, NA, NA, NA, NA, 10L, 8L, 10L, 9L, 4L, 6L), fac2 = structure(c(2L,
4L, 4L, 2L, 4L, 1L, 5L, 3L, 3L, 5L, 1L, 3L, 5L, 2L, 1L, 4L, 3L,
5L, 1L, 2L), .Label = c("[78.4,266]", "(266,326]", "(326,552]",
"(552,818]", "(818,1.55e+03]"), class = "factor")), .Names = c("var1",
"var2", "fac1", "fac2"), row.names = c(NA, -20L), class = "data.frame")

For part 1 I prefer aggregate because it keeps the data in a more R-like one observation per row format.
aggregate(var1, list(fac1, fac2), mean, na.rm=T)

Related

Sort rows grouped by grep alphabetically

I have a dataframe with a row full of adverse events but also relationships of these adverse events to the procedure, like this:
df <- data.frame(
adverse_event = c(
"Haemorrhage", "related", "likely related",
"Other", "related", "likely related", "Pain", "related", "likely related",
"Subcapsular hematoma", "related", "likely related", "Ascites",
"related", "likely related", "Hyperbilirubinemia", "related",
"likely related", "Liver abscess", "related", "likely related",
"Pleural effusion with drainage", "related", "likely related",
"Pneumothorax", "related", "likely related", "Biliary leakage / occlusion / fistula",
"related", "likely related", "Portal vein thrombosis", "related",
"likely related", "Sepsis", "related", "likely related"
),
grade_1 = c(
4L, 4L, 0L, 3L, 6L, 1L, 8L, 4L, 5L, 3L, 1L, 3L, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA
),
grade_2 = c(
2L, 3L, 0L, 11L, 3L, 7L, 2L, 4L, 2L, 1L, 2L, 0L, 1L, 1L, 0L,
1L, 0L, 2L, 1L, 1L, 0L, 1L, 2L, 1L, 1L, 1L, 0L, NA, NA, NA, NA,
NA, NA, NA, NA, NA
),
grade_3 = c(
1L, 4L, 1L, 5L, 3L, 2L, 2L, 5L, 1L, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 4L, 5L, 1L, NA, NA, NA, 1L, 1L, 0L, 1L, 2L, 0L, 1L,
1L, 0L, 1L, 1L, 0L
),
grade_4 = c(
2L, 4L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA
)
)
Now I'd like to sort the adverse events alphabetically but of course take the "related", "likely related" rows with the individual adverse event rows, so I'd like to somehow group them first.
In this example it's always 3 rows, but let's assume it could be sometimes 2, 4 or 5 rows too (all except the adverse event rows containing "related" in the string/name though e.g. 'unlikely related').
I know, I can get the indices of the adverse event rows by
grep('related', df$adverse_event, invert = T) but I'm unsure how to use this to group the rows together before sorting them.
Edit: Beginning of the left column of the desired output:
expected_output_left_column <- data.frame(adverse_event = c(
"Ascites", "related", "likely related",
"Biliary leakage / occlusion / fistula", "related", "likely related" ) )
Thank you!
Another solution using base r and lead function from dplyr
# where start each group
id <- grep('related', df$adverse_event, invert = T)
# size of each group
size <- lead(id) - id
size_of_last_group <- nrow(df) - id[length(id)] + 1
size[length(size)] <- size_of_last_group
# add col with id
df$id <- paste0(rep(df$adverse_event[id], times = size),
df$adverse_event)
# order
df <- df[order(df$id), ]
# remove id
df$id <- NULL
You can do the following:
library(dplyr)
left_join(
df,
df %>%
filter(!grepl('related',adverse_event)) %>%
select(adverse_event) %>%
arrange(adverse_event) %>%
mutate(o = row_number())
) %>%
mutate(o = data.table::nafill(o, "locf")) %>%
arrange(o) %>%
select(-o)
Output:
adverse_event grade_1 grade_2 grade_3 grade_4
1 Ascites NA 1 NA NA
2 related NA 1 NA NA
3 likely related NA 0 NA NA
4 Biliary leakage / occlusion / fistula NA NA 1 NA
5 related NA NA 2 NA
6 likely related NA NA 0 NA
7 Haemorrhage 4 2 1 2
8 related 4 3 4 4
9 likely related 0 0 1 1
10 Hyperbilirubinemia NA 1 NA NA
11 related NA 0 NA NA
12 likely related NA 2 NA NA
13 Liver abscess NA 1 4 NA
14 related NA 1 5 NA
15 likely related NA 0 1 NA
16 Other 3 11 5 NA
17 related 6 3 3 NA
18 likely related 1 7 2 NA
19 Pain 8 2 2 NA
20 related 4 4 5 NA
21 likely related 5 2 1 NA
22 Pleural effusion with drainage NA 1 NA NA
23 related NA 2 NA NA
24 likely related NA 1 NA NA
25 Pneumothorax NA 1 1 NA
26 related NA 1 1 NA
27 likely related NA 0 0 NA
28 Portal vein thrombosis NA NA 1 NA
29 related NA NA 1 NA
30 likely related NA NA 0 NA
31 Sepsis NA NA 1 NA
32 related NA NA 1 NA
33 likely related NA NA 0 NA
34 Subcapsular hematoma 3 1 NA NA
35 related 1 2 NA NA
36 likely related 3 0 NA NA
Note that this uses data.table::nafill().. A full data.table solution is as below:
library(data.table)
setDT(df)
data.table(adverse_event = sort(df[!grepl('related',adverse_event), adverse_event]))[, o:=.I][
df, on="adverse_event"][, o:=nafill(o, "locf")][order(o), !c("o")]
Add a "group" variable and sort
tmp=!grepl("related",df$adverse_event)
df$grp=cumsum(tmp)
df[order(match(df$grp,order(df$adverse_event[tmp]))),]
adverse_event grade_1 grade_2 grade_3 grade_4 grp
13 Ascites NA 1 NA NA 5
14 related NA 1 NA NA 5
15 likely related NA 0 NA NA 5
28 Biliary leakage / occlusion / fistula NA NA 1 NA 10
29 related NA NA 2 NA 10
30 likely related NA NA 0 NA 10
1 Haemorrhage 4 2 1 2 1
2 related 4 3 4 4 1
3 likely related 0 0 1 1 1
16 Hyperbilirubinemia NA 1 NA NA 6
17 related NA 0 NA NA 6
18 likely related NA 2 NA NA 6
19 Liver abscess NA 1 4 NA 7
20 related NA 1 5 NA 7
21 likely related NA 0 1 NA 7
4 Other 3 11 5 NA 2
5 related 6 3 3 NA 2
6 likely related 1 7 2 NA 2
7 Pain 8 2 2 NA 3
8 related 4 4 5 NA 3
9 likely related 5 2 1 NA 3
22 Pleural effusion with drainage NA 1 NA NA 8
23 related NA 2 NA NA 8
24 likely related NA 1 NA NA 8
25 Pneumothorax NA 1 1 NA 9
26 related NA 1 1 NA 9
27 likely related NA 0 0 NA 9
31 Portal vein thrombosis NA NA 1 NA 11
32 related NA NA 1 NA 11
33 likely related NA NA 0 NA 11
34 Sepsis NA NA 1 NA 12
35 related NA NA 1 NA 12
36 likely related NA NA 0 NA 12
10 Subcapsular hematoma 3 1 NA NA 4
11 related 1 2 NA NA 4
12 likely related 3 0 NA NA 4
Just to throw in another tidyverse solution:
library(tidyr)
library(dplyr)
df %>%
mutate(grp = if_else(grepl("related", adverse_event),
NA_character_,
adverse_event)) %>%
fill(grp) %>%
nest(data = -grp) %>%
arrange(grp) %>%
unnest(cols = data) %>%
select(-grp)
# # A tibble: 36 × 5
# adverse_event grade_1 grade_2 grade_3 grade_4
# <chr> <int> <int> <int> <int>
# 1 Ascites NA 1 NA NA
# 2 related NA 1 NA NA
# 3 likely related NA 0 NA NA
# 4 Biliary leakage / occlusion / fistula NA NA 1 NA
# 5 related NA NA 2 NA
# 6 likely related NA NA 0 NA
# 7 Haemorrhage 4 2 1 2
# 8 related 4 3 4 4
# 9 likely related 0 0 1 1
# 10 Hyperbilirubinemia NA 1 NA NA
# 11 related NA 0 NA NA
# 12 likely related NA 2 NA NA
# 13 Liver abscess NA 1 4 NA
# 14 related NA 1 5 NA
# 15 likely related NA 0 1 NA
# 16 Other 3 11 5 NA
# 17 related 6 3 3 NA
# 18 likely related 1 7 2 NA
# 19 Pain 8 2 2 NA
# 20 related 4 4 5 NA
# 21 likely related 5 2 1 NA
# 22 Pleural effusion with drainage NA 1 NA NA
# 23 related NA 2 NA NA
# 24 likely related NA 1 NA NA
# 25 Pneumothorax NA 1 1 NA
# 26 related NA 1 1 NA
# 27 likely related NA 0 0 NA
# 28 Portal vein thrombosis NA NA 1 NA
# 29 related NA NA 1 NA
# 30 likely related NA NA 0 NA
# 31 Sepsis NA NA 1 NA
# 32 related NA NA 1 NA
# 33 likely related NA NA 0 NA
# 34 Subcapsular hematoma 3 1 NA NA
# 35 related 1 2 NA NA
# 36 likely related 3 0 NA NA
Explanation
mutate + fill: Label each adverse_event with the stem, i.e. re-label all related records with the corresponding event above.
Nest all columns, but keep the newly created grp column, which bears the name of the stem adverse event.
Sort the adverse event stems.
Unnest the rows again.
Remove the grp column.
An approach using rank. Using an extended data set with 4 entries for "Ascites".
library(dplyr)
df %>%
mutate(ord = !grepl("related", adverse_event),
grp = cumsum(ord),
Rank = rank(adverse_event[ord])[grp]) %>%
arrange(Rank) %>%
select(-c(ord, grp, Rank))
adverse_event grade_1 grade_2 grade_3 grade_4
1 Ascites NA 1 NA NA
2 related NA 1 NA NA
3 related NA 1 NA NA
4 likely related NA 0 NA NA
5 Biliary leakage / occlusion / fistula NA NA 1 NA
6 related NA NA 2 NA
7 likely related NA NA 0 NA
8 Haemorrhage 4 2 1 2
9 related 4 3 4 4
10 likely related 0 0 1 1
11 Hyperbilirubinemia NA 1 NA NA
12 related NA 0 NA NA
13 likely related NA 2 NA NA
14 Liver abscess NA 1 4 NA
15 related NA 1 5 NA
16 likely related NA 0 1 NA
17 Other 3 11 5 NA
18 related 6 3 3 NA
19 likely related 1 7 2 NA
20 Pain 8 2 2 NA
21 related 4 4 5 NA
22 likely related 5 2 1 NA
23 Pleural effusion with drainage NA 1 NA NA
24 related NA 2 NA NA
25 likely related NA 1 NA NA
26 Pneumothorax NA 1 1 NA
27 related NA 1 1 NA
28 likely related NA 0 0 NA
29 Portal vein thrombosis NA NA 1 NA
30 related NA NA 1 NA
31 likely related NA NA 0 NA
32 Sepsis NA NA 1 NA
33 related NA NA 1 NA
34 likely related NA NA 0 NA
35 Subcapsular hematoma 3 1 NA NA
36 related 1 2 NA NA
37 likely related 3 0 NA NA
extended data
df <- structure(list(adverse_event = c("Haemorrhage", "related", "likely related",
"Other", "related", "likely related", "Pain", "related", "likely related",
"Subcapsular hematoma", "related", "likely related", "Ascites",
"related", "related", "likely related", "Hyperbilirubinemia",
"related", "likely related", "Liver abscess", "related", "likely related",
"Pleural effusion with drainage", "related", "likely related",
"Pneumothorax", "related", "likely related", "Biliary leakage / occlusion / fistula",
"related", "likely related", "Portal vein thrombosis", "related",
"likely related", "Sepsis", "related", "likely related"), grade_1 = c(4L,
4L, 0L, 3L, 6L, 1L, 8L, 4L, 5L, 3L, 1L, 3L, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), grade_2 = c(2L, 3L, 0L, 11L, 3L, 7L, 2L, 4L,
2L, 1L, 2L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 2L, 1L, 1L, 0L, 1L, 2L,
1L, 1L, 1L, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA), grade_3 = c(1L,
4L, 1L, 5L, 3L, 2L, 2L, 5L, 1L, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 4L, 5L, 1L, NA, NA, NA, 1L, 1L, 0L, 1L, 2L, 0L, 1L, 1L,
0L, 1L, 1L, 0L), grade_4 = c(2L, 4L, 1L, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA,
37L), class = "data.frame")
Here is a benchmark of the different suggestions if needed :
library(bench)
library(dplyr)
library(data.table)
library(tidyr)
df <- data.frame(
adverse_event = c(
"Haemorrhage", "related", "likely related",
"Other", "related", "likely related", "Pain", "related", "likely related",
"Subcapsular hematoma", "related", "likely related", "Ascites",
"related", "likely related", "Hyperbilirubinemia", "related",
"likely related", "Liver abscess", "related", "likely related",
"Pleural effusion with drainage", "related", "likely related",
"Pneumothorax", "related", "likely related", "Biliary leakage / occlusion / fistula",
"related", "likely related", "Portal vein thrombosis", "related",
"likely related", "Sepsis", "related", "likely related"
),
grade_1 = c(
4L, 4L, 0L, 3L, 6L, 1L, 8L, 4L, 5L, 3L, 1L, 3L, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA
),
grade_2 = c(
2L, 3L, 0L, 11L, 3L, 7L, 2L, 4L, 2L, 1L, 2L, 0L, 1L, 1L, 0L,
1L, 0L, 2L, 1L, 1L, 0L, 1L, 2L, 1L, 1L, 1L, 0L, NA, NA, NA, NA,
NA, NA, NA, NA, NA
),
grade_3 = c(
1L, 4L, 1L, 5L, 3L, 2L, 2L, 5L, 1L, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 4L, 5L, 1L, NA, NA, NA, 1L, 1L, 0L, 1L, 2L, 0L, 1L,
1L, 0L, 1L, 1L, 0L
),
grade_4 = c(
2L, 4L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA
)
)
paul_carteron <- function(df){
# where start each group
id <- grep('related', df$adverse_event, invert = T)
# size of each group
size <- lead(id) - id
size_of_last_group <- nrow(df) - id[length(id)] + 1
size[length(size)] <- size_of_last_group
# add col with id
df$id <- paste0(rep(df$adverse_event[id], times = size),
df$adverse_event)
# order
df <- df[order(df$id), ]
# remove id
df$id <- NULL
}
lang_tang_dplyr <- function(df){
left_join(
df,
df %>%
filter(!grepl('related', adverse_event)) %>%
select(adverse_event) %>%
arrange(adverse_event) %>%
mutate(o = row_number())
) %>%
mutate(o = data.table::nafill(o, "locf")) %>%
arrange(o) %>%
select(-o)
}
lang_tang_databable <- function(df) {
setDT(df)
data.table(adverse_event = sort(df[!grepl('related',adverse_event), adverse_event]))[, o:=.I][
df, on="adverse_event"][, o:=nafill(o, "locf")][order(o), !c("o")]
}
andre_wilberg <- function(df){
df %>%
mutate(ord = !grepl("related", adverse_event),
grp = cumsum(ord),
Rank = rank(adverse_event[ord])[grp]) %>%
arrange(Rank) %>%
select(-c(ord, grp, Rank))
}
thotal <- function(df){
df %>%
mutate(grp = if_else(grepl("related", adverse_event),
NA_character_,
adverse_event)) %>%
fill(grp) %>%
nest(data = -grp) %>%
arrange(grp) %>%
unnest(cols = data) %>%
select(-grp)
}
results = bench::mark(
iterations = 1000, check = FALSE, time_unit = "s", filter_gc = FALSE,
paul_carteron = paul_carteron(df),
lang_tang_dplyr = lang_tang_dplyr(df),
lang_tang_databable = lang_tang_databable(df),
andre_wilberg = andre_wilberg(df),
thotal = thotal(df)
)
plot(results)

R: function or similar to sum up number of non-NA values for columns that contain specific characters in large data set [duplicate]

This question already has an answer here:
How many non-NA values in each row for a matrix?
(1 answer)
Closed 2 years ago.
I have a large data set (907 x 1855). I need to count how many follow-ups each patient have had. A follow-up column contain either 1, 2 or NA and a follow-up may be defined as the specific column being !is.na().
There are up to max 20 follow-ups. As you can see, each follow up has the _vX added as suffix where x correspond to the number of follow-up.
Thus, follow-up nr 20 has the very inconvenient RedCapautogenerated column name p$fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12_v13_v14_v15_v16_v17_v18_v19_v20
> head(p)
fu_location fu_location_v2 fu_location_v2_v3 fu_location_v2_v3_v4 ...
1 1 1 1 1 ...
2 2 2 1 2 ...
3 1 1 1 2 ...
4 2 2 2 2 ...
I need to count the number of !is.na(for column names that contains "fu_location"). I tried mutate(n_fu = sum(!is.na(contains("fu_location")))) but that did not work.
Preferably, the solution is in dplyr. Perhaps a function?
Expected output:
> head(p)
fu_location fu_location_v2 fu_location_v2_v3 fu_location_v2_v3_v4 n_fu
1 1 1 1 1 8
2 2 2 1 2 20
3 1 1 1 2 4
4 2 2 2 2 4
Data
p <- structure(list(fu_location = c(1L, 2L, 1L, 2L), fu_location_v2 = c(1L,
2L, 1L, 2L), fu_location_v2_v3 = c(1L, 1L, 1L, 2L), fu_location_v2_v3_v4 = c(1L,
2L, 2L, 2L), fu_location_v2_v3_v4_v5 = c(2L, 2L, NA, NA), fu_location_v2_v3_v4_v5_v6 = c(1L,
2L, NA, NA), fu_location_v2_v3_v4_v5_v6_v7 = c(2L, 1L, NA, NA
), fu_location_v2_v3_v4_v5_v6_v7_v8 = c(1L, 2L, NA, NA), fu_location_v2_v3_v4_v5_v6_v7_v8_v9 = c(NA,
2L, NA, NA), fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10 = c(NA,
1L, NA, NA), fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11 = c(NA,
2L, NA, NA), fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12 = c(NA,
1L, NA, NA), fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12_v13 = c(NA,
2L, NA, NA), fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12_v13_v14 = c(NA,
2L, NA, NA), fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12_v13_v14_v15 = c(NA,
1L, NA, NA), fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12_v13_v14_v15_v16 = c(NA,
2L, NA, NA), fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12_v13_v14_v15_v16_v17 = c(NA,
1L, NA, NA), fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12_v13_v14_v15_v16_v17_v18 = c(NA,
2L, NA, NA), fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12_v13_v14_v15_v16_v17_v18_v19 = c(NA,
1L, NA, NA), fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12_v13_v14_v15_v16_v17_v18_v19_v20 = c(NA,
2L, NA, NA)), row.names = c(NA, -4L), class = "data.frame")
Use rowSums :
library(dplyr)
p %>% mutate(n_fu = rowSums(!is.na(select(., contains('fu_location')))))
Or in base :
p$n_fu <- rowSums(!is.na(p[grep('fu_location', names(p))]))

Calculate weighted average in R dataframe

"f","index","values","lo.80","lo.95","hi.80","hi.95"
"auto.arima",2017-07-31 16:40:00,2.81613884762163,NA,NA,NA,NA
"auto.arima",2017-07-31 16:40:10,2.83441637197378,NA,NA,NA,NA
"auto.arima",2017-07-31 20:39:10,3.18497899649267,2.73259824384436,2.49312233904087,3.63735974914098,3.87683565394447
"auto.arima",2017-07-31 20:39:20,3.16981166809297,2.69309866988864,2.44074205235297,3.64652466629731,3.89888128383297
"ets",2017-07-31 16:40:00,2.93983529828936,NA,NA,NA,NA
"ets",2017-07-31 16:40:10,3.09739640066054,NA,NA,NA,NA
"ets",2017-07-31 20:39:10,3.1951571771414,2.80966705285567,2.60560090776504,3.58064730142714,3.78471344651776
"ets",2017-07-31 20:39:20,3.33876776870274,2.93593322313957,2.72268549604222,3.7416023142659,3.95485004136325
"bats",2017-07-31 16:40:00,2.82795253090081,NA,NA,NA,NA
"bats",2017-07-31 16:40:10,2.96389759682623,NA,NA,NA,NA
"bats",2017-07-31 20:39:10,3.1383560278272,2.76890864400062,2.573335012715,3.50780341165378,3.7033770429394
"bats",2017-07-31 20:39:20,3.3561357998535,2.98646195085452,2.79076843614824,3.72580964885248,3.92150316355876
I have a dataframe like above which has column names as:"f","index","values","lo.80","lo.95","hi.80","hi.95".
What I want to do is calculate the weighted average on forecast results from different models for a particular timestamp. By this what i mean is
For every row in auto.arima there is a corresponding row in ets and bats with the same timestamp value, so weighted average should be calculated something like this:
value_arima*1/3 + values_ets*1/3 + values_bats*1/3 ; similary values for lo.80 and other columns should be calculated.
This result should be stored in a new dataframe with all the weighted average values.
New dataframe can look something like:
index(timesamp from above dataframe),avg,avg_lo_80,avg_lo_95,avg_hi_80,avg_hi_95
I think I need to use spread() and mutate () function to achieve this. Being new to R I'm unable to proceed after forming this dataframe.
Please help.
The example you provide is not a weighted average but a simple average.
What you want is a simple aggregate.
The first part is your dataset as provided by dput (better for sharing here)
d <- structure(list(f = structure(c(1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L,
2L, 2L, 2L, 2L), .Label = c("auto.arima", "bats", "ets"), class = "factor"),
index = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L,
3L, 4L), .Label = c("2017-07-31 16:40:00", "2017-07-31 16:40:10",
"2017-07-31 20:39:10", "2017-07-31 20:39:20"), class = "factor"),
values = c(2.81613884762163, 2.83441637197378, 3.18497899649267,
3.16981166809297, 2.93983529828936, 3.09739640066054, 3.1951571771414,
3.33876776870274, 2.82795253090081, 2.96389759682623, 3.1383560278272,
3.3561357998535), lo.80 = c(NA, NA, 2.73259824384436, 2.69309866988864,
NA, NA, 2.80966705285567, 2.93593322313957, NA, NA, 2.76890864400062,
2.98646195085452), lo.95 = c(NA, NA, 2.49312233904087, 2.44074205235297,
NA, NA, 2.60560090776504, 2.72268549604222, NA, NA, 2.573335012715,
2.79076843614824), hi.80 = c(NA, NA, 3.63735974914098, 3.64652466629731,
NA, NA, 3.58064730142714, 3.7416023142659, NA, NA, 3.50780341165378,
3.72580964885248), hi.95 = c(NA, NA, 3.87683565394447, 3.89888128383297,
NA, NA, 3.78471344651776, 3.95485004136325, NA, NA, 3.7033770429394,
3.92150316355876)), .Names = c("f", "index", "values", "lo.80",
"lo.95", "hi.80", "hi.95"), class = "data.frame", row.names = c(NA,
-12L))
> aggregate(d[,3:7], by = d["index"], FUN = mean)
index values lo.80 lo.95 hi.80 hi.95
1 2017-07-31 16:40:00 2.861309 NA NA NA NA
2 2017-07-31 16:40:10 2.965237 NA NA NA NA
3 2017-07-31 20:39:10 3.172831 2.770391 2.557353 3.575270 3.788309
4 2017-07-31 20:39:20 3.288238 2.871831 2.651399 3.704646 3.925078
You can save this output in an object and change the column names as you want.
If you really want a weighted average this is a way to obtain it (here bat has a weight of 0.8 and the 2 others 0.1) :
> d$weight <- (d$f)
> levels(d$weight) # check the levels
[1] "auto.arima" "bats" "ets"
> levels(d$weight) <- c(0.1, 0.8, 0.1)
> # transform the factor into numbers
> # warning as.numeric(d$weight) is not correct !!
> d$weight <- as.numeric(as.character((d$weight)))
>
> # Here the result is saved in a data.frame called "result
> result <- aggregate(d[,3:7] * d$weight, by = d["index"], FUN = sum)
> result
index values lo.80 lo.95 hi.80 hi.95
1 2017-07-31 16:40:00 2.837959 NA NA NA NA
2 2017-07-31 16:40:10 2.964299 NA NA NA NA
3 2017-07-31 20:39:10 3.148698 2.769353 2.568540 3.528043 3.728857
4 2017-07-31 20:39:20 3.335767 2.952073 2.748958 3.719460 3.922576

assigning current values towards function for energetics

Good afternoon all.
I have a simple hypothetical dataset of 2 male runners, both weigh 70kg and have 100,000 kcal of energy available. Both runners ran a 13 day race with varying daily distances (km).
Because cost of locomotion declines as one loses mass, I need to recalculate energy expended based on body mass at the end of the previous day.
For example, 70 kg runner loses 0.5 kg during day 1. The runner now weighs 69.5 kg and will have a different rate of energy expenditure during day 2.
Here's the data - I manually entered expected values for the first 2 days of each runner.
structure(list(ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("Male1", "Male2"), class = "factor"), Day = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L), Km = c(7L, 9L, 15L,
11L, 5L, 15L, 25L, 15L, 12L, 11L, 9L, 8L, 7L, 1L, 2L, 6L, 8L,
15L, 9L, 15L, 12L, 1L, 25L, 2L, 3L, 14L), Kcal = c(328.91, 410.24,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 46.99, 93.97, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), NewMass = c(66.96, 66.91,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 69.995, 69.984, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("ID", "Day",
"Km", "Kcal", "NewMass"), class = "data.frame", row.names = c(NA,
-26L))
I first defined a function for the cost of locomotion (kcal/kg)
costkm <- function(kg) {
(2.57 * (kg ^ -0.316)) * kg #formula to calculate cost of locomotion
}
Then I tried to write code for each individual (ID) and each day run (nrow)
followed by distance (KM) ran during each day for both runners times the locomotion formula followed by code for updated mass at end of the day (9,000 kcal per kg). I cannot "visualize" how to structure the codes...any help would be appreciated!
for (i in unique (df$ID)){
for (j in 1:nrow(i))
df$Kcal<-df$Km * costkm #calculate kcal expended based on current weight
NewMass<- lag(NewMass) - (Kcal/9000) #calculate updated mass
}
}
First, use split to divide up your dataframe into two dataframes (one for each runner):
dfs <- split(df, df$ID)
which gives you a list with two elements:
str(dfs)
# $ Male1:'data.frame': 13 obs. of 5 variables:
# ..$ ID : Factor w/ 2 levels "Male1","Male2": 1 1 1 1 1 1 1 1 1 1 ...
# ..$ Day : int [1:13] 1 2 3 4 5 6 7 8 9 10 ...
# ..$ Km : int [1:13] 7 9 15 11 5 15 25 15 12 11 ...
# ..$ Kcal : num [1:13] 329 410 NA NA NA ...
# ..$ NewMass: num [1:13] 67 66.9 NA NA NA ...
# $ Male2:'data.frame': 13 obs. of 5 variables:
# ..$ ID : Factor w/ 2 levels "Male1","Male2": 2 2 2 2 2 2 2 2 2 2 ...
# ..$ Day : int [1:13] 1 2 3 4 5 6 7 8 9 10 ...
# ..$ Km : int [1:13] 1 2 6 8 15 9 15 12 1 25 ...
# ..$ Kcal : num [1:13] 47 94 NA NA NA ...
# ..$ NewMass: num [1:13] 70 70 NA NA NA ...
Now, you can use lapply to iterate through the list, and iterate through the rows for each dataframe:
output <- lapply(dfs, function(df) {
for (i in 2:nrow(df)) {
df$Kcal[i] <- df$Km[i] * costkm(df$NewMass[i - 1])
df$NewMass[i] <- df$NewMass[i - 1] - (df$Kcal[i] / 9000)
}
return(df)
})
output <- do.call(rbind, output) # Bind the dataframes together again
This gives you
output
# ID Day Km Kcal NewMass
# Male1.1 Male1 1 7 328.91000 66.96000
# Male1.2 Male1 2 9 410.23565 66.91442
# Male1.3 Male1 3 15 683.40769 66.83848
# <snip>
# Male2.14 Male2 1 1 46.99000 69.99500
# Male2.15 Male2 2 2 93.96994 69.98456
# Male2.16 Male2 3 6 281.88106 69.95324
# <snip>

R: Replacing a factor with an integer value in numerous cells across numerous columns

So, my challenge has been to convert a raw scale csv to a scored csv. Within numerous columns, the file has cells filled with "Strongly Agree" to "Strongly Disagree", 6 levels. These factors need to be converted in integers 5 to 0 respectively.
I have tried unsuccessfully to use sapply and convert the table to a string. Sapply works on the vector, but it destroys the table structure.
Method 1:
dat$Col<-sapply(dat$Col,switch,'Strongly Disagree'=0,'Disagree'=1,'Slightly Disagree'=2,'Slightly Agree'=3,'Agree'=4, 'Strongly Agree'=5)
My second approach is to convert the csv into a string. When I examined the dput output, I saw the area I wanted to target that started with a .Label="","Strongly Agree"... Mistake. My changes did not result in a useful outcome.
My third approach came from the internet gods of destruction who seemed to express that gsub() might handle the string approach as well. Nope, again the underlying table structure was destroyed.
Method #3: Convert into a string and pattern match
dat <- textConnection("control/Surveys/StudyDat_1.csv")
#Score Scales
##"Strongly Agree"= 5
##"Agree"= 4
##"Strongly Disagree" = 0
#levels(dat$Col) <- gsub("Strongly Agree", "5", levels(dat$Col))
df<- gsub("Strongly Agree", "5",dat)
dat<-read.csv(textConnection(df),header=TRUE)
In the end, I am wanting to replace ALL "Strongly Agree" to 5 across numerous columns without the consequence of destroying the retrievability of the data.
Maybe I used the wrong search string and you know the resource I need to address this problem. I would rather avoid ALL character vector approaches as that this would require labeling each column if you provide a code response. It will need to go across ALL COLUMNS.
Thanks
Data Sample Problem
structure(list(last_updated = structure(c(3L, 1L, 7L, 2L, 10L, 6L, 8L, 9L, 7L, 5L, 4L), .Label = c("2016-05-13T12:53:56.704184Z",
"2016-05-13T12:54:09.273359Z", "2016-05-13T12:54:22.757251Z",
"2016-05-14T12:44:13.474992Z", "2016-05-14T12:44:31.736469Z",
"2016-05-16T16:45:10.623410Z", "2016-05-16T16:46:17.881402Z",
"2016-05-16T16:46:55.122257Z", "2016-05-16T16:47:14.160793Z",
"2016-05-24T02:26:04.770799Z"), class = "factor"), feedback = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), A = structure(c(NA,
NA, 2L, NA, 1L, NA, NA, NA, 2L, NA, NA), .Label = c("", "Slightly Disagree"
), class = "factor"), B = structure(c(NA, NA, 2L, NA, 1L, NA,
NA, NA, 3L, NA, NA), .Label = c("", "Disagree", "Strongly Agree"
), class = "factor"), C = structure(c(NA, NA, 2L, NA, 1L, NA,
NA, NA, 3L, NA, NA), .Label = c("", "Agree", "Disagree"), class = "factor"),
D = structure(c(NA, NA, 2L, NA, 1L, NA, NA, NA, 2L, NA, NA
), .Label = c("", "Agree"), class = "factor"), E = structure(c(NA,
NA, 2L, NA, 1L, NA, NA, NA, 3L, NA, NA), .Label = c("", "Agree",
"Strongly Disagree"), class = "factor")), .Names = c("last_updated",
"feedback", "A", "B", "C", "D", "E"), class = "data.frame", row.names = c(NA,
-11L))
Data Sample Solution
df<-dget(structure(list(last_updated = structure(c(3L, 1L, 7L, 2L, 10L, 6L,8L, 9L, 7L, 5L, 4L), .Label = c("2016-05-13T12:53:56.704184Z",
"2016-05-13T12:54:09.273359Z", "2016-05-13T12:54:22.757251Z",
"2016-05-14T12:44:13.474992Z", "2016-05-14T12:44:31.736469Z",
"2016-05-16T16:45:10.623410Z", "2016-05-16T16:46:17.881402Z",
"2016-05-16T16:46:55.122257Z", "2016-05-16T16:47:14.160793Z",
"2016-05-24T02:26:04.770799Z"), class = "factor"), feedback = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), A = c(NA, NA, 2L, NA,
NA, NA, NA, NA, 2L, NA, NA), B = c(NA, NA, 1L, NA, NA, NA, NA,
NA, 5L, NA, NA), C = c(NA, NA, 4L, NA, NA, NA, NA, NA, 1L, NA,
NA), D = c(NA, NA, 4L, NA, NA, NA, NA, NA, 4L, NA, NA), E = c(NA,
NA, 4L, NA, NA, NA, NA, NA, 0L, NA, NA)), .Names = c("last_updated",
"feedback", "A", "B", "C", "D", "E"), class = "data.frame", row.names = c(NA,-11L)))
we can use factor with levels specified
nm1 <- c('Strongly Disagree', 'Disagree',
'Slightly Disagree','Slightly Agree','Agree', 'Strongly Agree')
factor(dat$col, levels = nm1,
labels = 0:5))
If there are multiple factor columns with the same levels, identify the factor columns ('i1'), loop through it with lapply and specify the levels and labels.
i1 <- sapply(dat, is.factor)
dat[i1] <- lapply(dat[i1], factor, levels = nm1, labels= 0:5)
Update
Using the OP's dput output
dat[-(1:2)] <- lapply(dat[-(1:2)], factor, levels = nm1, labels = 0:5)
dat
# last_updated feedback A B C D E
#1 2016-05-13T12:54:22.757251Z NA <NA> <NA> <NA> <NA> <NA>
#2 2016-05-13T12:53:56.704184Z NA <NA> <NA> <NA> <NA> <NA>
#3 2016-05-16T16:46:17.881402Z NA 2 1 4 4 4
#4 2016-05-13T12:54:09.273359Z NA <NA> <NA> <NA> <NA> <NA>
#5 2016-05-24T02:26:04.770799Z NA <NA> <NA> <NA> <NA> <NA>
#6 2016-05-16T16:45:10.623410Z NA <NA> <NA> <NA> <NA> <NA>
#7 2016-05-16T16:46:55.122257Z NA <NA> <NA> <NA> <NA> <NA>
#8 2016-05-16T16:47:14.160793Z NA <NA> <NA> <NA> <NA> <NA>
#9 2016-05-16T16:46:17.881402Z NA 2 5 1 4 0
#10 2016-05-14T12:44:31.736469Z NA <NA> <NA> <NA> <NA> <NA>
#11 2016-05-14T12:44:13.474992Z NA <NA> <NA> <NA> <NA> <NA>
Another option is set from data.table
library(data.table)
for(j in names(dat)[-(1:2)]){
set(dat, i = NULL, j= j, value = factor(dat[[j]], levels = nm1, labels = 0:5))
}
I would just match each target column vector into a precomputed character vector to get an integer index. You can subtract 1 afterward to change the range from 1:6 to 0:5.
## define desired value order, ascending
o <- c(
'Strongly Disagree',
'Disagree',
'Slightly Disagree',
'Slightly Agree',
'Agree',
'Strongly Agree'
);
## convert target columns
for (cn in names(df)[-(1:2)]) df[[cn]] <- match(as.character(df[[cn]]),o)-1L;
df;
## last_updated feedback A B C D E
## 1 2016-05-13T12:54:22.757251Z NA NA NA NA NA NA
## 2 2016-05-13T12:53:56.704184Z NA NA NA NA NA NA
## 3 2016-05-16T16:46:17.881402Z NA 2 1 4 4 4
## 4 2016-05-13T12:54:09.273359Z NA NA NA NA NA NA
## 5 2016-05-24T02:26:04.770799Z NA NA NA NA NA NA
## 6 2016-05-16T16:45:10.623410Z NA NA NA NA NA NA
## 7 2016-05-16T16:46:55.122257Z NA NA NA NA NA NA
## 8 2016-05-16T16:47:14.160793Z NA NA NA NA NA NA
## 9 2016-05-16T16:46:17.881402Z NA 2 5 1 4 0
## 10 2016-05-14T12:44:31.736469Z NA NA NA NA NA NA
## 11 2016-05-14T12:44:13.474992Z NA NA NA NA NA NA
Previous answers might meet your needs, but note that changing the labels of a factor isn't the same as changing a factor to an integer variable. One possibility would be to use ifelse (I've made a new data frame as the one you posted didn't actually have variables with these levels in it):
lev <- c('Strongly disagree', 'Disagree', 'Slightly disagree', 'Slightly agree', 'Agree', 'Strongly agree')
dta <- sample(lev, 55, replace = TRUE)
dta <- data.frame(matrix(dta, nrow = 11))
names(dta) <- LETTERS[1:5]
f_to_int <- function(f) {
if (is.factor(f)){
ifelse(f == 'Strongly disagree', 0,
ifelse(f == 'Disagree', 1,
ifelse(f == 'Slightly disagree', 2,``
ifelse(f == 'Slightly agree', 3,
ifelse(f == 'Agree', 4,
ifelse(f == 'Strongly agree', 5, f))))))
} else f
}
dta2 <- sapply(dta, f_to_int)
Note that this returns a matrix, but it is easily converted to a data frame if necessary.

Resources