dplyr: how to use multiple variables in mutate_at funs

dplyr: how to use multiple variables in mutate_at funs - r

I'm trying to generalize this chunk of code:
trimmedMeans %>%
mutate(Expectation_mean = paste(format(Expectation_mean, digits = 2, nsmall = 2),
"±",
format(Expectation_sd, digits = 2, nsmall = 2)),
Interesting_mean = paste(format(Interesting_mean, digits = 2, nsmall = 2),
"±",
format(Interesting_sd, digits = 2, nsmall = 2)),
Useful_mean = paste(format(Useful_mean, digits = 2, nsmall = 2),
"±",
format(Useful_sd, digits = 2, nsmall = 2)),
OralPresentation_mean = paste(format(OralPresentation_mean, digits = 2, nsmall = 2),
"±",
format(OralPresentation_sd, digits = 2, nsmall = 2))
)
I'm trying to do this:
paste.Mean.Sd <- function(m, s){
paste(format(m, digits = 2, nsmall = 2),
"±",
format(s, digits = 2, nsmall = 2)) }
trimmedMeans2 <- trimmedMeans %>%
mutate_at(vars(contains('_mean')), funs(paste.Mean.Sd(
vars(contains('_mean')), vars(contains('_sd'))
)) )
What I'm getting is something like this:
What I expected to have is this:
What am I missing?
EDIT 1
This code gives me the right result for the "left part" (mean) of the string, not for the SD part:
trimmedMeans %>%
mutate_at(vars(contains('_mean')), funs(paste.Mean.Sd(., str_replace(., "_mean", "_sd"))))
EDIT 2
The following is the code to reproduce the dataframe I used:
trimmedMeans <- structure(list(TrackName = structure(c(2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("Llytse", "Mneshe", "Phrypa", "Veormi"), class = "factor"),
SpeakerName = c("Delta Shelby", "Irvine Fairburn", "Kristine Harland",
"Paislee Jež", "Rhianna Clarke", "Spencer Hargrave"), NumOfVoters = c(15L,
14L, 5L, 14L, 17L, 19L), Expectation_mean = c(4.6, 5, 4.2,
4.07142857142857, 4.41176470588235, 4.73684210526316), Interesting_mean = c(4.46666666666667,
5.5, 5, 4.78571428571429, 5.05882352941176, 5.57894736842105
), Useful_mean = c(4.6, 5.14285714285714, 4.6, 4.28571428571429,
4.52941176470588, 5.42105263157895), OralPresentation_mean = c(4.33333333333333,
5.28571428571429, 5.4, 4.85714285714286, 5.17647058823529,
5.52631578947368), Expectation_sd = c(0.736788397613007,
0.784464540552736, 0.836660026534076, 0.474631146549323,
0.870260272089029, 0.561951486949016), Interesting_sd = c(0.639940473422184,
0.518874521662771, 0.707106781186548, 0.801783725737273,
0.747545001596402, 0.507257273501788), Useful_sd = c(0.9102589898328,
1.02710518202619, 0.894427190999916, 0.913873533463375, 1.06757008311068,
0.507257273501788), OralPresentation_sd = c(0.975900072948533,
0.825420305855557, 0.547722557505166, 0.864437821507567,
0.63593377383646, 0.611775290321498)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), vars = c("TrackName",
"SpeakerName"), drop = TRUE, indices = list(0L, 1L, 2L, 3L, 4L,
5L), group_sizes = c(1L, 1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L, labels = structure(list(
TrackName = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Llytse",
"Mneshe", "Phrypa", "Veormi"), class = "factor"), SpeakerName = c("Delta Shelby",
"Irvine Fairburn", "Kristine Harland", "Paislee Jež", "Rhianna Clarke",
"Spencer Hargrave")), class = "data.frame", row.names = c(NA,
-6L), vars = c("TrackName", "SpeakerName"), drop = TRUE, .Names = c("TrackName",
"SpeakerName")), .Names = c("TrackName", "SpeakerName", "NumOfVoters",
"Expectation_mean", "Interesting_mean", "Useful_mean", "OralPresentation_mean",
"Expectation_sd", "Interesting_sd", "Useful_sd", "OralPresentation_sd"
))

I found your approach to be challenging, even after reading the Programming with dplyr vignette. Instead, I used tidyr to gather() and spread() the data to get your desired result, which was more intuitive to me.
library(tidyr)
trimmedMeans %>%
gather(key, value, -TrackName, -SpeakerName, -NumOfVoters) %>%
mutate_at('value', format, digits = 2, nsmall = 2) %>%
separate(key, c('var', 'key')) %>%
group_by(SpeakerName, var) %>%
spread(key, value) %>%
group_by(SpeakerName) %>%
unite(value, mean, sd, sep = " ± ") %>%
mutate(var = paste0(var, "_sd")) %>%
spread(var, value)
# A tibble: 6 x 7
# Groups: SpeakerName [6]
TrackName SpeakerName NumOfVoters Expectation_sd Interesting_sd
<fct> <chr> <int> <chr> <chr>
1 Mneshe Delta Shel… 15 4.60 ± 0.74 4.47 ± 0.64
2 Mneshe Irvine Fai… 14 5.00 ± 0.78 5.50 ± 0.52
3 Mneshe Kristine H… 5 4.20 ± 0.84 5.00 ± 0.71
4 Mneshe Paislee Jež 14 4.07 ± 0.47 4.79 ± 0.80
5 Mneshe Rhianna Cl… 17 4.41 ± 0.87 5.06 ± 0.75
6 Mneshe Spencer Ha… 19 4.74 ± 0.56 5.58 ± 0.51
# ... with 2 more variables: OralPresentation_sd <chr>,
# Useful_sd <chr>

I solved in the meantime with this trick:
for (characteristic in speaker.characteristcs) {
characteristic_str <- paste0(characteristic, "_str")
trimmedMeans[characteristic_str] <-
trimmedMeans %>% ungroup() %>% select( contains(characteristic) ) %>%
tidyr::unite()
}
paste.Mean.Sd <- function(s){
paste(format(as.numeric(strsplit(s, "\\_")[[1]][1]), digits = 2, nsmall = 2),
"±",
format(as.numeric(strsplit(s, "\\_")[[1]][2]), digits = 2, nsmall = 2)) }
trimmedMeans %>%
mutate_at(vars(contains('_str')),
funs(paste.Mean.Sd(.))) %>%
ungroup() %>%
select(SpeakerName, NumOfVoters, contains('_str')) %>%
I don't know if it's possible to get the result with a single statement, using dplyr programming features.

Related

Finding mean of variable across each month/year

I have a dataset that looks similar to this:
> dput(df)
structure(list(Date = c("3/23/21", "4/11/22", "6/30/22"), Banana_wasted = c(4L,
2L, 5L), Apple_wasted = c(6L, 0L, 3L), Orange_wasted = c(1L,
4L, 1L), Banana_ordered = c(5L, 7L, 7L), Apple_Ordered = c(9L,
8L, 9L), Orange_ordered = c(5L, 6L, 6L), Banana_eaten = c(5L,
5L, 6L), Apple_eaten = c(7L, 7L, 4L), Orange_eaten = c(8L, 8L,
8L)), class = "data.frame", row.names = c(NA, -3L))
I want to find the % of fruit wasted per month/year (in relation to how many fruits were ordered).
it should be:
(Banana_wasted+Apple_wasted+Orange_wasted) / (Banana_ordered + Apple_ordered+ Orange_ordered)
So, for 3/21, it should be:
(4+6+1/5+9+5)*100 = 57.9%
I would like to do this for every month of the year.

library(tidyverse)
df %>%
group_by(Date = floor_date(mdy(Date), "month")) %>%
summarise(
wasted = sum(across(contains("wasted"))) / sum(across(contains("ordered"))),
wasted_eaten = sum(across(contains("wasted"))) / sum(across(contains("eaten")))
)
# A tibble: 3 x 3
Date wasted wasted_eaten
<date> <dbl> <dbl>
1 2021-03-01 0.579 0.579
2 2022-04-01 0.286 0.314
3 2022-06-01 0.409 0.523

library(dplyr)
library(lubridate)
df %>%
mutate(Date = as.Date(Date, format = "%m/%d/%y"),
pct_wasted = (Banana_wasted + Apple_wasted + Orange_wasted) / (Banana_ordered + Apple_Ordered + Orange_ordered) * 100) %>%
group_by(year = year(Date), month = month(Date)) %>%
summarize(avg_pct_wasted = mean(pct_wasted))
#> # A tibble: 3 × 3
#> # Groups: year [2]
#> year month avg_pct_wasted
#> <dbl> <dbl> <dbl>
#> 1 2021 3 57.9
#> 2 2022 4 28.6
#> 3 2022 6 40.9
Created on 2023-02-06 with reprex v2.0.2

Pivot longer to get single wasted and ordered columns across all fruits; use lubridate::floor_date() and mdy() to get months from Date; group by month; then sum and divide to get your percentages:
library(dplyr)
library(tidyr)
library(lubridate)
dat %>%
rename(Apple_ordered = Apple_Ordered) %>% # for consistent capitalization
pivot_longer(
Banana_wasted:Orange_eaten,
names_to = c("Fruit", ".value"),
names_sep = "_"
) %>%
group_by(month = floor_date(mdy(Date), "month")) %>%
summarize(pct_wasted = sum(wasted) / sum(ordered)) %>%
ungroup()
# # A tibble: 3 × 2
# month pct_wasted
# <date> <dbl>
# 1 2021-03-01 0.579
# 2 2022-04-01 0.286
# 3 2022-06-01 0.409
If you prefer character labels, use strftime() instead of floor_date(), and scales::percent() for the percentages:
library(scales)
dat %>%
rename(Apple_ordered = Apple_Ordered) %>%
pivot_longer(
Banana_wasted:Orange_eaten,
names_to = c("Fruit", ".value"),
names_sep = "_"
) %>%
group_by(month = strftime(mdy(Date), "%B %Y")) %>%
summarize(pct_wasted = percent(sum(wasted) / sum(ordered), accuracy = 0.1)) %>%
ungroup()
# # A tibble: 3 × 2
# month pct_wasted
# <chr> <chr>
# 1 April 2022 28.6%
# 2 June 2022 40.9%
# 3 March 2021 57.9%

Aggregate columns based on categories given by another dataframe

I have a dataframe where each column has some vector of data. I want to apply the mean columnwise, but filtered by groups which are given by a second dataframe. That is, each column belongs to a group and this information is in the second dataframe.
Here is some example dataset: df is the dataframe with the data vectors, df_category contains the category for each column.
df=structure(list(x1 = c(0.461302090047301, -1.19974381763812, -0.888258056235799,
0.300889698419314, 0.836911163114131, 0.0540388337324712), x2 = c(1.33736696170763,
-0.687026295689823, 1.12205295626651, -0.848925266014684, 1.16092168555067,
0.591202293337843), x3 = c(-0.279052669225263, -0.780435476613128,
-0.852870619718068, -0.708611614262357, -0.761659405740852, 0.487033696695474
), x4 = c(-0.222767493777229, 1.50328295132467, 0.934670132217215,
1.37678188537077, 0.343280062984192, 1.23279081824003), x5 = c(-1.08074586121729,
0.208120194894818, -0.52245832008453, 0.944618465137011, 0.749834485631317,
-0.81118414509141)), class = "data.frame", row.names = c(NA,
-6L))
df_category=structure(list(Col_name = structure(1:5, .Label = c("x1", "x2",
"x3", "x4", "x5"), class = "factor"), Category = structure(c(1L,
1L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor")), class = "data.frame", row.names = c(NA,
-5L))
The result I want is this one:
df_result=structure(list(mean_A = c(0.899334525877468, -0.943385056663974,
0.116897450015357, -0.274017783797685, 0.998916424332403, 0.322620563535157
), mean_B = c(-0.527522008073261, 0.310322556535454, -0.146886269195128,
0.537596245415141, 0.110485047624885, 0.302880123281364)), class = "data.frame", row.names = c(NA,
-6L))

in Base R:
a <- with(df_category, setNames(Category, Col_name))[names(df)[col(df)]]
tapply(unlist(df), list(row(df), a), mean)
A B
1 0.8993345 -0.5275220
2 -0.9433851 0.3103226
3 0.1168975 -0.1468863
4 -0.2740178 0.5375962
5 0.9989164 0.1104850
6 0.3226206 0.3028801
Another option:
sapply(with(df_category, split.default(df[Col_name], Category)), rowMeans)
A B
[1,] 0.8993345 -0.5275220
[2,] -0.9433851 0.3103226
[3,] 0.1168975 -0.1468863
[4,] -0.2740178 0.5375962
[5,] 0.9989164 0.1104850
[6,] 0.3226206 0.3028801

We can use tidyverse to reshape the data values, merge the category data, and compute means for groups "A" and "B":
library(tidyverse)
df_result <- df %>%
mutate(idx = row_number()) %>%
pivot_longer(-idx) %>%
inner_join(df_category, c(name = 'Col_name')) %>%
group_by(Category, idx) %>%
summarize(mean = mean(value)) %>%
pivot_wider(names_from = Category, values_from = mean, names_prefix = 'mean_') %>%
select(-idx)
mean_A mean_B
<dbl> <dbl>
1 0.899 -0.528
2 -0.943 0.310
3 0.117 -0.147
4 -0.274 0.538
5 0.999 0.110
6 0.323 0.303

Filtering multiple, specific columns for multiple time ranges

I'm QCing data and for several tanks/data_types there are faulty data that need to be removed, spanning multiple time ranges. The data_types, tanks and time ranges that contain faulty data have been reported in a separate data frame, a snippet of which is contained in this QC_table:
structure(list(trial = c(1L, 1L, 1L, 1L, 2L, 2L), data_type = c("Temp",
"pH", "pH", "pH", "Temp", "Temp"), tank = c("29", "40", "40",
"40", "13", "29"), date_time_start = c("2021-03-31 8:30", "2021-03-31 7:50",
"2021-03-31 10:25", "2021-03-31 17:05", "2021-04-07 10:25", "2021-04-08 10:30"
), date_time_end = c("2021-03-31 18:00", "2021-03-31 8:15", "2021-03-31 10:40",
"2021-03-31 17:30", "2021-04-07 17:20", "2021-04-10 18:25"),
to.be.removed = c("yes ", "yes ", "yes ", "yes ", "yes ",
"yes "), reason = c("calibration error", "faulty probe",
"faulty probe", "faulty probe", "calibration error", "faulty probe"
), data_type_tank = c("WalchemTempTank29", "pH_Tank40", "pH_Tank40",
"pH_Tank40", "WalchemTempTank13", "WalchemTempTank29")), row.names = c(NA,
-6L), class = "data.frame")
There are additional trials and tanks to this. My approach to this was to create a new dataframe with all the data that needs to be removed (based on data_type_tank and date_time_start/end columns in the QC table), and then remove that data frame from the original dataframe. I don't know if this is the most logical, but I wouldn't know how I would be able to remove the data from the original dataframe.
I construct a new dataframe using:
new_dataframe <- dataframe %>%
select(c(Measurement.time, Trial, contains(urchin_temp_pH_QC$data_type_tank))) %>% head(10)
structure(list(Measurement.time = c("2021-03-30 11:00", "2021-03-30 11:05",
"2021-03-30 11:10", "2021-03-30 11:15", "2021-03-30 11:20", "2021-03-30 11:25",
"2021-03-30 11:30", "2021-03-30 11:35", "2021-03-30 11:40", "2021-03-30 11:45"
), Trial = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), WalchemTempTank29_Avg = c("18.8",
"18.67", "18.58", "18.48", "18.38", "18.29", "18.2", "18.12",
"18.03", "18"), WalchemTempTank29_Std = c("0.037", "0.025", "0.032",
"0.029", "0.017", "0.018", "0.026", "0.025", "0.024", "0.023"
), pH_Tank40_Avg = c("7.859", "7.863", "7.868", "7.866", "7.863",
"7.864", "7.865", "7.867", "7.869", "7.87"), pH_Tank40_Std = c("0.007",
"0.006", "0.002", "0.001", "0.002", "0.001", "0.002", "0.001",
"0.002", "0.004"), WalchemTempTank13_Avg = c("10.26", "10.22",
"10.21", "10.24", "10.27", "10.3", "10.32", "10.34", "10.37",
"10.4"), WalchemTempTank13_Std = c("0.01", "0.013", "0.005",
"0.01", "0.007", "0.006", "0.006", "0.008", "0.008", "0.005")), row.names = 4:13, class = "data.frame")
However now, based on the QC table there are some rows (date/time) that I need to remove or subset for, but only for specific columns (ie. those columns that contain the data_type_tank). I think I can do this manually, using the code below, and then binding or joining columns/rows where needed, but this seems like an arduous process.
subset_row_1_QC_table <- dataframe %>% select(Measurement.time, contains("WalchemTempTank29")) %>%
subset(Measurement.time >= as.POSIXct("2021-03-31 08:30") & Measurement.time <= as.POSIXct("2021-03-31 18:00"))
Is there any way to automate this process, removing or subsetting column-specific rows, based on columns from a different data frame? I think ideally my dataframe would look something like an expanded version of eg:
Measurement.time
Trial
WaterTempTank29_Avg
WaterTempTank29_Std
pH_Tank40_Avg
pH_Tank40_Std
2021-03-31 08:30
1
18.8
0.037
NA
NA
[all 5-min intervals]
NA
NA
2021-03-31 18:00
1
18.36
0.023
NA
NA
2021-03-31 07:50
1
NA
NA
7.854
0.001
[all 5-min intervals]
1
NA
NA
7.88
0.001
2021-03-31 08:15
1
NA
NA
7.84
0.001
2021-03-31 10:25
1
NA
NA
7.881
0.001
[all 5-min intervals]
1
NA
NA
7.804
0.001
2021-03-31 10:40
1
NA
NA
7.881
0
Any help would be greatly appreciated! I hope I've been able to explain properly my problem, first-time user of StackOverflow.
Cheers,
edit: Thanks r2evans & GuedesBF -- hope this is now better/fixed.

I would do it like this.
Prepare data for variables CF and df
CF = structure(list(trial = c(1L, 1L, 1L), data_type = c("Temp", "pH",
"pH"), tank = c("29", "40", "40"), date_time_start = structure(c(1617204600,
1617202200, 1617211500), tzone = "", class = c("POSIXct", "POSIXt"
)), date_time_end = structure(c(1617238800, 1617203700, 1617212400
), tzone = "", class = c("POSIXct", "POSIXt")), to.be.removed = c("yes ",
"yes ", "yes "), reason = c("calibration error", "faulty probe",
"faulty probe"), data_type_tank = c("WalchemTempTank29", "pH_Tank40",
"pH_Tank40")), row.names = c(NA, -3L), groups = structure(list(
data_type = c("pH", "Temp"), tank = c("40", "29"), .rows = structure(list(
2:3, 1L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = 1:2, class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
df = structure(list(Measurement.time = c("2021-03-30 11:00", "2021-03-30 11:05",
"2021-03-30 11:10", "2021-03-30 11:15", "2021-03-31 18:30", "2021-03-30 11:25",
"2021-03-30 11:30", "2021-03-30 11:35", "2021-03-31 17:00", "2021-03-31 19:28"
), Trial = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), WalchemTempTank29_Avg = c("18.8",
"18.67", "18.58", "18.48", "18.38", "18.29", "18.2", "18.12",
"18.03", "18"), WalchemTempTank29_Std = c("0.037", "0.025", "0.032",
"0.029", "0.017", "0.018", "0.026", "0.025", "0.024", "0.023"
), pH_Tank40_Avg = c("7.859", "7.863", "7.868", "7.866", "7.863",
"7.864", "7.865", "7.867", "7.869", "7.87"), pH_Tank40_Std = c("0.007",
"0.006", "0.002", "0.001", "0.002", "0.001", "0.002", "0.001",
"0.002", "0.004"), WalchemTempTank13_Avg = c("10.26", "10.22",
"10.21", "10.24", "10.27", "10.3", "10.32", "10.34", "10.37",
"10.4"), WalchemTempTank13_Std = c("0.01", "0.013", "0.005",
"0.01", "0.007", "0.006", "0.006", "0.008", "0.008", "0.005")), row.names = 4:13, class = "data.frame")
Note, I changed some data in the variable Measurement.time to include the events in the CF table.
Prepare separate tables CFTemp and CFpH
CFTemp = CF %>% ungroup() %>%
filter(data_type == "Temp") %>%
mutate(Temp = "Temp",
Temp_start = date_time_start,
Temp_end = date_time_end) %>%
select(Temp, Temp_start, Temp_end)
CFpH = CF %>% ungroup() %>%
filter(data_type == "pH") %>%
mutate(pH = "pH",
pH_start = date_time_start,
pH_end = date_time_end) %>%
select(pH, pH_start, pH_end)
Prepare two functions returning vectors with binary values and for which indices data should be removed.
fTemp = function(df) CFTemp %>% left_join(df, by="Temp") %>%
mutate(TempRm = Measurement.time>=Temp_start & Measurement.time<=Temp_end) %>%
group_by(ID) %>%
summarise(TempRm = any(TempRm)) %>%
pull(TempRm)
fpH = function(df) CFpH %>% left_join(df, by="pH") %>%
mutate(pHRm = Measurement.time>=pH_start & Measurement.time<=pH_end) %>%
group_by(ID) %>%
summarise(pHRm = any(pHRm)) %>%
pull(pHRm)
Convert the data frame
df1 = df %>% as_tibble() %>%
mutate(Measurement.time = as.POSIXct(Measurement.time),
ID = 1:nrow(.),
Temp = "Temp",
pH = "pH") %>%
mutate(
TmpRm = fTemp(.),
pHRm = fpH(.)
) %>%
mutate(
WalchemTempTank29_Avg = ifelse(TmpRm, NA, WalchemTempTank29_Avg),
WalchemTempTank29_Std = ifelse(TmpRm, NA, WalchemTempTank29_Std),
WalchemTempTank13_Avg = ifelse(TmpRm, NA, WalchemTempTank13_Avg),
WalchemTempTank13_Std = ifelse(TmpRm, NA, WalchemTempTank13_Std),
pH_Tank40_Avg = ifelse(pHRm, NA, pH_Tank40_Avg),
pH_Tank40_Std = ifelse(pHRm, NA, pH_Tank40_Std),
) %>%
select(Measurement.time:WalchemTempTank13_Std)
df1
output
# A tibble: 10 x 8
Measurement.time Trial WalchemTempTank29_Avg WalchemTempTank29_Std pH_Tank40_Avg pH_Tank40_Std WalchemTempTank13_Avg WalchemTempTank13_Std
<dttm> <int> <chr> <chr> <chr> <chr> <chr> <chr>
1 2021-03-30 11:00:00 1 18.8 0.037 7.859 0.007 10.26 0.01
2 2021-03-30 11:05:00 1 18.67 0.025 7.863 0.006 10.22 0.013
3 2021-03-30 11:10:00 1 18.58 0.032 7.868 0.002 10.21 0.005
4 2021-03-30 11:15:00 1 18.48 0.029 7.866 0.001 10.24 0.01
5 2021-03-31 18:30:00 1 NA NA 7.863 0.002 NA NA
6 2021-03-30 11:25:00 1 18.29 0.018 7.864 0.001 10.3 0.006
7 2021-03-30 11:30:00 1 18.2 0.026 7.865 0.002 10.32 0.006
8 2021-03-30 11:35:00 1 18.12 0.025 7.867 0.001 10.34 0.008
9 2021-03-31 17:00:00 1 18.03 0.024 NA NA 10.37 0.008
10 2021-03-31 19:28:00 1 NA NA NA NA NA NA
And that's all.
Update 1
library(tidyverse)
CFTemp = CF %>% ungroup() %>%
filter(data_type == "Temp") %>%
mutate(Temp = "Temp",
Temp_start = date_time_start,
Temp_end = date_time_end) %>%
select(Temp, tank, Temp_start, Temp_end)
CFpH = CF %>% ungroup() %>%
filter(data_type == "pH") %>%
mutate(pH = "pH",
pH_start = date_time_start,
pH_end = date_time_end) %>%
select(pH, pH_start, pH_end)
fTemp = function(df, Tank){
out = CFTemp %>% filter(tank==Tank) %>%
left_join(df, by="Temp") %>%
mutate(TempRm = Measurement.time>=Temp_start & Measurement.time<=Temp_end) %>%
group_by(ID) %>%
summarise(TempRm = any(TempRm)) %>%
pull(TempRm)
if(length(out)==0) FALSE else out
}
fpH = function(df) CFpH %>% left_join(df, by="pH") %>%
mutate(pHRm = Measurement.time>=pH_start & Measurement.time<=pH_end) %>%
group_by(ID) %>%
summarise(pHRm = any(pHRm)) %>%
pull(pHRm)
df1 = df %>% as_tibble() %>% #Step 1
mutate(Measurement.time = as.POSIXct(Measurement.time),
ID = 1:nrow(.),
Temp = "Temp",
pH = "pH") %>%
mutate( #Step 2
TmpRm29 = fTemp(., 29),
TmpRm13 = fTemp(., 13),
pHRm = fpH(.)
) %>%
mutate( #Step 3
WalchemTempTank29_Avg = ifelse(TmpRm29, NA, WalchemTempTank29_Avg),
WalchemTempTank29_Std = ifelse(TmpRm29, NA, WalchemTempTank29_Std),
WalchemTempTank13_Avg = ifelse(TmpRm13, NA, WalchemTempTank13_Avg),
WalchemTempTank13_Std = ifelse(TmpRm13, NA, WalchemTempTank13_Std),
pH_Tank40_Avg = ifelse(pHRm, NA, pH_Tank40_Avg),
pH_Tank40_Std = ifelse(pHRm, NA, pH_Tank40_Std),
) %>%
select(Measurement.time:WalchemTempTank13_Std)
df1
output
# A tibble: 10 x 8
Measurement.time Trial WalchemTempTank29_Avg WalchemTempTank29_Std pH_Tank40_Avg pH_Tank40_Std WalchemTempTank13_Avg WalchemTempTank13_Std
<dttm> <int> <chr> <chr> <chr> <chr> <chr> <chr>
1 2021-03-30 11:00:00 1 18.8 0.037 7.859 0.007 10.26 0.01
2 2021-03-30 11:05:00 1 18.67 0.025 7.863 0.006 10.22 0.013
3 2021-03-30 11:10:00 1 18.58 0.032 7.868 0.002 10.21 0.005
4 2021-03-30 11:15:00 1 18.48 0.029 7.866 0.001 10.24 0.01
5 2021-03-31 18:30:00 1 NA NA 7.863 0.002 10.27 0.007
6 2021-03-30 11:25:00 1 18.29 0.018 7.864 0.001 10.3 0.006
7 2021-03-30 11:30:00 1 18.2 0.026 7.865 0.002 10.32 0.006
8 2021-03-30 11:35:00 1 18.12 0.025 7.867 0.001 10.34 0.008
9 2021-03-31 17:00:00 1 18.03 0.024 NA NA 10.37 0.008
10 2021-03-31 19:28:00 1 NA NA NA NA 10.4 0.005

Error with a function inside a Lapply() R

I'm having a very strange error in a script that used to work perfectly and I don't know what's the problem. I start creating a very long list with several data frames with the exact number of columns. The list is called lst. Then I want to do a summarise table with means and sd. Here is the script for that:
w1 <- lapply(lst, function(i) t(cbind(Mean = colMeans(i[, c(6,7,8,9)], na.rm = TRUE),
Sds = colSds(as.matrix(i[, c(6,7,8,9)]), na.rm = TRUE),
N = length(i[,2]),
len.max=max(i[,6]))))
The number of the columns are correct. However when I run the script first I get the Debug location and when I stopped I get this error message:
Error in t(cbind(Mean = colMeans(i[, c(6, 7, 8, 9)], na.rm = TRUE), Sds = colSds(as.matrix(i[, :
error in evaluating the argument 'x' in selecting a method for function 't': Error in `[.data.frame`(i, , c(6, 7, 8, 9)) : undefined columns selected
I dont know whats wrong with the function. I try to search in the internet and I saw something about change as,matrix for data.matrix. However this does not make the trick.
Indeed I get the same problem for another function very similar:
a1 <- lapply(lst, function(i) t(cbind(l1 = NROW(which(i[,6]>1)),
l1.05 = NROW(which(i[,6]<=1)) - NROW(which(i[,6]>0.5)),
l05.03 = NROW(which(i[,6]>0.3)) - NROW(which(i[,6]<=0.5)),
l03 = NROW(which(i[,6]<=0.3)))))
With the same outcome:
Error in t(cbind(l1 = NROW(which(i[, 6] > 1)), l1.05 = NROW(which(i[, :
error in evaluating the argument 'x' in selecting a method for function 't': Error in `[.data.frame`(i, , 6) : undefined columns selected
Can someone point me out what is the problem. Do you need some data? Thanks!
I'm working with the last RStudio and with the following packages:
plyr, matrixStats,dplyr
Here is an example of the list:
> lst
[[1]]
X Chr new pos1 pos2 len nsnp n.ind per.ind
1 1 1 1 12900000 13700000 0.9 284.7560 23.77778 7.952434
2 2 1 2 17000000 17300000 0.4 126.5582 16.00000 5.351171
3 3 1 3 21200000 21500000 0.4 126.5582 40.75000 13.628763
4 4 1 4 45300000 45700000 0.5 158.1978 23.20000 7.759197
5 5 1 5 45900000 46600000 0.8 253.1165 31.12500 10.409699
[[2]]
X Chr new pos1 pos2 len nsnp n.ind per.ind
1 1 1 1 12900000 13700000 0.9 312.90267 24.44444 4.288499
2 2 1 2 21200000 21500000 0.4 139.06785 38.00000 6.666667
3 3 1 3 32600000 33000000 0.5 173.83482 28.40000 4.982456
4 4 1 4 35800000 36100000 0.4 139.06785 37.25000 6.535088
5 5 1 5 36300000 36300000 0.1 34.76696 22.00000 3.859649
[[3]]
X Chr new pos1 pos2 len nsnp n.ind per.ind
1 1 1 1 35700000 36500000 0.9 287.4214 12.22222 11.42264
2 2 1 2 45900000 46600000 0.8 255.4857 12.50000 11.68224
3 3 1 3 49400000 50700000 1.4 447.1000 21.78571 20.36048
4 4 1 4 51000000 52000000 1.1 351.2929 16.00000 14.95327
5 5 1 5 52200000 53000000 0.9 287.4214 19.66667 18.38006
dput(lst[1:3])
list(structure(list(X = 1:5, Chr = c(1L, 1L, 1L, 1L, 1L), new = 1:5,
pos1 = c(12900000, 1.7e+07, 21200000, 45300000, 45900000),
pos2 = c(13700000, 17300000, 21500000, 45700000, 46600000
), len = c(0.9, 0.4, 0.4, 0.5, 0.8), nsnp = c(284.756031128405,
126.558236057069, 126.558236057069, 158.197795071336, 253.116472114137
), n.ind = c(23.7777777777778, 16, 40.75, 23.2, 31.125),
per.ind = c(7.95243403939056, 5.35117056856187, 13.628762541806,
7.75919732441472, 10.4096989966555)), .Names = c("X", "Chr",
"new", "pos1", "pos2", "len", "nsnp", "n.ind", "per.ind"), row.names = c(NA,
5L), class = "data.frame"), structure(list(X = 1:5, Chr = c(1L,
1L, 1L, 1L, 1L), new = 1:5, pos1 = c(12900000, 21200000, 32600000,
35800000, 36300000), pos2 = c(13700000, 21500000, 3.3e+07, 36100000,
36300000), len = c(0.9, 0.4, 0.5, 0.4, 0.1), nsnp = c(312.90267141585,
139.0678539626, 173.83481745325, 139.0678539626, 34.76696349065
), n.ind = c(24.4444444444444, 38, 28.4, 37.25, 22), per.ind = c(4.28849902534113,
6.66666666666667, 4.98245614035088, 6.53508771929825, 3.85964912280702
)), .Names = c("X", "Chr", "new", "pos1", "pos2", "len", "nsnp",
"n.ind", "per.ind"), row.names = c(NA, 5L), class = "data.frame"),
structure(list(X = 1:5, Chr = c(1L, 1L, 1L, 1L, 1L), new = 1:5,
pos1 = c(35700000, 45900000, 49400000, 5.1e+07, 52200000
), pos2 = c(36500000, 46600000, 50700000, 5.2e+07, 5.3e+07
), len = c(0.9, 0.8, 1.4, 1.1, 0.9), nsnp = c(287.421428571429,
255.485714285714, 447.1, 351.292857142857, 287.421428571429
), n.ind = c(12.2222222222222, 12.5, 21.7857142857143,
16, 19.6666666666667), per.ind = c(11.4226375908619,
11.6822429906542, 20.3604806408545, 14.9532710280374,
18.380062305296)), .Names = c("X", "Chr", "new", "pos1",
"pos2", "len", "nsnp", "n.ind", "per.ind"), row.names = c(NA,
5L), class = "data.frame"))

Get row value corresponding to a column name

I have this dataset
Book2 <- structure(list(meanX3 = c(21.66666667, 21.66666667, 11, 25, 240.3333333
), meanX1 = c(23, 34.5, 10, 25, 233.5), meanX2 = c(24.5, 26.5,
20, 25, 246.5), to_select = structure(c(3L, 1L, 2L, 1L, 1L), .Label = c("meanX1",
"meanX2", "meanX3"), class = "factor"), selected = c(NA, NA,
NA, NA, NA)), .Names = c("meanX3", "meanX1", "meanX2", "to_select",
"selected"), class = "data.frame", row.names = c(NA, -5L))
I want to get the coresponding row value for the column name on variable to_select .
I have tried
Book2 %>% dplyr::mutate(selected=.[paste0(to_select)])
But it returns all the column values. How can I go about to get a data set like
structure(list(meanX3 = c(21.66666667, 21.66666667, 11, 25, 240.3333333
), meanX1 = c(23, 34.5, 10, 25, 233.5), meanX2 = c(24.5, 26.5,
20, 25, 246.5), to_select = structure(c(3L, 1L, 2L, 1L, 1L), .Label = c("meanX1",
"meanX2", "meanX3"), class = "factor"), selected = c(21.66, 34.5,
20, 25, 240.33)), .Names = c("meanX3", "meanX1", "meanX2", "to_select",
"selected"), class = "data.frame", row.names = c(NA, -5L))

With base R, a safe strategy would be something like
cols <- as.character(unique(Book2$to_select))
row_col <- match(Book2$to_select, cols)
idx <- cbind(seq_along(Book2$to_select), row_col)
selected <- Book2[, cols][idx]
Book2$selected <- selected
Or using tidyverse packages, something like
library(tidyverse)
Book2 %>% mutate(row=1:n()) %>%
gather(prop, val, meanX3:meanX2) %>%
group_by(row) %>%
mutate(selected=val[to_select==prop]) %>%
spread(prop, val) %>% select(-row)
Would be a decent strategy.

One way is to group by row using rowwise() and then get the value of the string in 'to_select' column
Book2 %>%
rowwise() %>%
mutate(selected = get(as.character(to_select)))
# A tibble: 5 × 5
# meanX3 meanX1 meanX2 to_select selected
# <dbl> <dbl> <dbl> <fctr> <dbl>
#1 21.66667 23.0 24.5 meanX3 21.66667
#2 21.66667 34.5 26.5 meanX1 34.50000
#3 11.00000 10.0 20.0 meanX2 20.00000
#4 25.00000 25.0 25.0 meanX1 25.00000
#5 240.33333 233.5 246.5 meanX1 233.50000

In base R you can use match to select the desired column and then matrix subsetting to select the particular element for each row like this
Book2$selected <- as.numeric(Book2[cbind(seq_len(nrow(Book2)),
match(Book2$to_select, names(Book2)))])

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

dplyr: how to use multiple variables in mutate_at funs - r

Related

Finding mean of variable across each month/year

Aggregate columns based on categories given by another dataframe

Filtering multiple, specific columns for multiple time ranges

Error with a function inside a Lapply() R

Get row value corresponding to a column name

Categories

Resources