Using dplyr to create an ICCs table - r

I am trying to create a table of ICCs for multiple raters and multiple variables using a function and dplyr, but it is not working as I expected.
This is the structure of the data frame and the expected ICCs table:
# Create data frame
ID <- c("r1", "r1", "r1", "r1", "r1", "r2", "r2", "r2", "r2", "r2", "r3", "r3", "r3", "r3", "r3")
V1.1 <- c(3, 3, 3, 3, 3, 3, 2, 3, 3, 1, 2, 2, 1, 1, 2)
V2.1 <- c(1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 3)
V3.1 <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
V4.1 <- c(2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2)
V1.2 <- c(3, 3, 3, 3, 3, 3, 2, 3, 2, 2, 3, 2, 1, 2, 1)
V2.2 <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2)
V3.2 <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
V4.2 <- c(2, 4, 2, 1, 3, 2, 1, 3, 2, 2, 3, 2, 1, 2, 1)
df <- data.frame(ID, V1.1, V2.1, V3.1, V4.1, V1.2, V2.2, V3.2, V4.2)
# Empty data frame for ICCs
ids <- c("r1", "r2", "r3")
vars <- c("V1", "V2", "V3", "V4")
icc_table <- data.frame(ID = ids)
icc_table <- cbind(icc_table, matrix(NA, nrow = length(ids), ncol = length(vars)))
names(icc_table)[2:ncol(icc_table)] <- vars
Here is the attempt to create the ICCs table with a function and dplyr:
# ICC function
icc.fun <- function(data, x1, x2){
  result <- irr::icc(subset(data, select = c(x1, x2)),
                     model = "twoway",
                     type = "agreement",
                     unit = "single")
  result$value
}
# Table attempt
icc_table <- df %>%
  pivot_longer(cols = -ID, names_to = c("criteria", ".value"), names_pattern = "(V\\d)\\.(\\d)") %>%
  group_by(ID, criteria) %>%
  rename("val1" = `1`, "val2" = `2`) %>%
  summarise(icc = icc.fun(df, val1, val2), .groups = "drop") %>%
  pivot_wider(id_cols = ID, names_from = criteria, values_from = icc)
However, it is not working: it returns a table full of NAs. When I tried the function on its own it seemed to work fine, so I guess the problem is in the dplyr code. Any solution apart from dplyr is also welcome!
Thanks!

I think the issue is the interaction between the subset() in your icc.fun and summarise(); try:
# ICC function
icc.fun <- function(x1, x2){
  result <- irr::icc(data.frame(x1, x2),
                     model = "twoway",
                     type = "agreement",
                     unit = "single")
  result$value
}
# Table attempt
icc_table <- df %>%
  pivot_longer(cols = -ID, names_to = c("criteria", ".value"), names_pattern = "(V\\d)\\.(\\d)") %>%
  group_by(ID, criteria) %>%
  rename("val1" = `1`, "val2" = `2`) %>%
  summarise(icc = icc.fun(val1, val2), .groups = "drop") %>%
  pivot_wider(id_cols = ID, names_from = criteria, values_from = icc)
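For what it's worth, a minimal sketch of the same fix without a helper function, building the two-column data frame for each group inline (this assumes dplyr and tidyr are attached):
library(dplyr)
library(tidyr)
# Call irr::icc() directly inside summarise(); val1/val2 are the current
# group's vectors, so data.frame(val1, val2) is exactly the two-column
# ratings matrix that icc() expects.
icc_table <- df %>%
  pivot_longer(cols = -ID, names_to = c("criteria", ".value"), names_pattern = "(V\\d)\\.(\\d)") %>%
  rename("val1" = `1`, "val2" = `2`) %>%
  group_by(ID, criteria) %>%
  summarise(icc = irr::icc(data.frame(val1, val2),
                           model = "twoway",
                           type = "agreement",
                           unit = "single")$value,
            .groups = "drop") %>%
  pivot_wider(id_cols = ID, names_from = criteria, values_from = icc)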

In case it is useful for someone, here is the solution that I found:
First, I simplified the function by subsetting the data with base R:
# ICC function
icc.fun <- function(data, x1, x2){
  result <- irr::icc(data[, c(x1, x2)],
                     model = "twoway",
                     type = "agreement",
                     unit = "single")
  result$value
}
Then I used group_modify() instead of summarise(), plus tibble::enframe():
# Create ICC table
icc_table <- df %>%
  pivot_longer(cols = -ID, names_to = c("criteria", ".value"), names_pattern = "(V\\d)\\.(\\d)") %>%
  group_by(ID, criteria) %>%
  rename("val1" = `1`, "val2" = `2`) %>%
  group_modify(~ {
    icc.fun(.x, "val1", "val2") %>%
      tibble::enframe(name = "variable", value = "icc")
  }) %>%
  pivot_wider(id_cols = ID, names_from = criteria, values_from = icc)
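Since icc.fun() returns a single number and group_modify() only requires its function to return a data frame, wrapping the scalar in a one-row tibble is an equivalent, slightly shorter sketch of the same step:
# Variant: a one-row tibble instead of enframe()
icc_table <- df %>%
  pivot_longer(cols = -ID, names_to = c("criteria", ".value"), names_pattern = "(V\\d)\\.(\\d)") %>%
  rename("val1" = `1`, "val2" = `2`) %>%
  group_by(ID, criteria) %>%
  group_modify(~ tibble::tibble(icc = icc.fun(.x, "val1", "val2"))) %>%
  pivot_wider(id_cols = ID, names_from = criteria, values_from = icc)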

Related

Using the map() function to apply a function to each element

I need, with the help of the map() function, to apply the above for each element.
How can I do so?
As dt is of class data.table, you can make a vector of the columns of interest (i.e. your items; below I use grepl() on the names) and then apply your weighting function to each of those columns using .SD and .SDcols, with by:
qs <- names(dt)[grepl("^q", names(dt))]
dt[, (paste0(qs, "wt")) := lapply(.SD, \(q) 1 / (sum(!is.na(q)) / .N)),
   by = .(sex, education_code, age), .SDcols = qs]
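The weight here is the inverse of the within-group response rate: 1/(sum(!is.na(q))/.N) simplifies to .N / sum(!is.na(q)), so, for example, a group of 3 rows with one missing answer on an item gets a weight of 3/2 = 1.5 for that item (and an item that is all NA within a group would get Inf).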
As mentioned in the comments, you are missing a dt <- in your dt[, .(ID, education_code, age, sex, item = q1_1)], which makes the column item unavailable in the following line dt[, no_respond := is.na(item)].
Your weighting scheme is not absolutely clear to me; however, assuming you want to do what your code does here, I would go with a dplyr solution to iterate over the columns.
# your data without the no_respond column and with the missing value in q2_3 corrected
dt <- data.table::data.table(
  ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  education_code = c(20, 50, 20, 60, 20, 10, 5, 12, 12, 12),
  age = c(87, 67, 56, 52, 34, 56, 67, 78, 23, 34),
  sex = c("F", "M", "M", "M", "F", "M", "M", "M", "M", "M"),
  q1_1 = c(NA, 1, 5, 3, 1, NA, 3, 4, 5, 1),
  q1_2 = c(NA, 1, 5, 3, 1, 2, NA, 4, 5, 1),
  q1_3 = c(NA, 1, 5, 3, 1, 2, 3, 4, 5, 1),
  q1_text = c(NA, 1, 5, 3, 1, 2, 3, 4, 5, 1),
  q2_1 = c(NA, 1, 5, 3, 1, 2, 3, 4, 5, 1),
  q2_2 = c(NA, 1, 5, 3, 1, 2, 3, 4, 5, 1),
  q2_3 = c(NA, 1, 5, 3, 1, NA, NA, 4, 5, 1),
  q2_text = c(NA, 1, 5, 3, 1, NA, 3, 4, 5, 1))
dt %>%
  group_by(sex, education_code, age) %>% # group the df by sex, education_code, age
  add_count() %>%                        # add a column n with the number of rows in each group
  mutate(across(starts_with("q"),        # for each column starting with "q"
                ~ 1/(sum(!is.na(.))/n),  # create a new column following your weight calculation
                .names = '{.col}_wgt')) %>% # name the new column with suffix "_wgt" added to the original name
  ungroup()
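Note that add_count() is what creates the n column referenced in the weight formula; an equivalent sketch would drop add_count() and use dplyr::n() directly inside the across() lambda instead.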

How to define a popularity score for a list of tags in R?

There is a dataset where each object has a comma-separated list of category tags. I would like to have an aggregated category score per object based on the categories' popularities. I can compute the sum, min, and max of the popularities, but it's not clear to me how an aggregated score should be calculated.
library(tidyverse)
library(tibble)
library(stringr)
# 1. Data
df <- tribble(
  ~object, ~category,
  1, "Software, Model, Cloud",
  2, "Model",
  3, "Cloud, Software",
  4, "Train, Test, Model",
  5, "Test, Model"
)
# 2. List of categories
list_category <- trimws(unlist(str_split(df$category, ",")))
# 3. Categories popularity
data.frame(category = list_category) %>%
  group_by(category) %>%
  summarise(n_count = n()) %>%
  arrange(-n_count) %>%
  ungroup()
# 4. Outcome with the undefined 'score_category' feature that I'd like to know how to score
tribble(
  ~object, ~sum_category, ~min_category, ~max_category, ~score_category,
  1, sum(c(2, 4, 2)), min(c(2, 4, 2)), max(c(2, 4, 2)), NA,
  2, sum(c(4)), min(c(4)), max(c(4)), NA,
  3, sum(c(2, 2)), min(c(2, 2)), max(c(2, 2)), NA,
  4, sum(c(1, 2, 4)), min(c(1, 2, 4)), max(c(1, 2, 4)), NA,
  5, sum(c(2, 4)), min(c(2, 4)), max(c(2, 4)), NA
)
Any ideas and code are welcome!
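Not a verdict on which score is best, but a sketch of the mechanics: attach each category's count back to its object with tidyr's separate_rows() and a join, then aggregate per object (the mean popularity is used below purely as an illustrative score_category, not a recommendation):
library(tidyverse)
# Popularity of each category across all objects
popularity <- df %>%
  separate_rows(category, sep = ",\\s*") %>%
  count(category, name = "n_count")
# One row per object-category pair, joined to the counts, then aggregated
df %>%
  separate_rows(category, sep = ",\\s*") %>%
  left_join(popularity, by = "category") %>%
  group_by(object) %>%
  summarise(sum_category = sum(n_count),
            min_category = min(n_count),
            max_category = max(n_count),
            score_category = mean(n_count))  # illustrative choice only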

Data manipulation for plotting in r, getting NULLs

I once used the code below to sort my data for a dumbbell plot; I tried reusing it with different data, but I am getting an empty output.
data10 <- structure(list(Trial_type = c(1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1,
1), Trial_type2 = c(1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1), GROUP = c("LLL",
"LLL", "LLL", "LRL", "LRL", "LRL", "RLR", "RLR", "RLR", "RRR",
"RRR", "RRR"), conditon2 = c("CEN_LLL", "IPS_LLL", "CTL_LLL",
"CEN_LRL", "IPS_LRL", "CTL_LRL", "CEN_RLR", "IPS_RLR", "CTL_RLR",
"CEN_RRR", "IPS_RRR", "CTL_RRR"), condition20 = c(1, 2, 3, 1,
2, 3, 1, 2, 3, 1, 2, 3), Training = c("left", "left", "left",
"right", "right", "right", "left", "left", "left", "right", "right",
"right"), AveResultantVel_102 = c(2.150005313, 1.854148813, 1.647962313,
2.35681725, 2.067673063, 1.10213475, 2.364870813, 2.027195438,
1.61692725, 2.111901813, 2.026179, 1.595148125), AveResultantVel_104 = c(2.37879375,
2.127869563, 1.903676063, 2.932732875, 2.230088313, 1.311275125,
2.69564575, 2.473001938, 1.926669438, 2.54519575, 2.201091438,
1.902556875)), row.names = c(NA, -12L), class = c("tbl_df", "tbl",
"data.frame"))
library(tidyverse)
data10A <- data10 %>%
  select(conditon2, Trial_type, AveResultantVel_102, AveResultantVel_104) %>%
  mutate("key" = "Change in resultant velocity (cm/s)")
data10A$Trial_type <- factor(data10A$Trial_type, levels = 1:2, labels = c("Retention", "Transfer"))
i1 <- grepl("_RR", levels(data10A$conditon2))
i2 <- grepl("_RL", levels(data10$conditon2))
i3 <- grepl("_LL", levels(data10A$conditon2))
RRR_levels <- levels(data10A$conditon2)[i1]
RLR_levels <- levels(data10A$conditon2)[i2]
LLL_levels <- levels(data10A$conditon2)[i3]
LRL_levels <- levels(data10A$conditon2)[!i1 & !i2 & !i3]
ord_levels <- c(LLL_levels, RRR_levels, RLR_levels, LRL_levels)
data10A$conditon2 <- factor(data10A$conditon2, levels = ord_levels)
conditon2 is of character type, so it does not have levels. Change it to a factor. (Note also that one of your grepl() lines reads levels(data10$conditon2) where it should be data10A.)
data10A$conditon2 <- factor(data10A$conditon2)
An alternative would be to use unique(data10A$conditon2) instead of levels(data10A$conditon2), which will work for both character and factor data (see the sketch after the complete code below).
Complete code -
library(dplyr)
data10A <- data10 %>%
  select(conditon2, Trial_type, AveResultantVel_102, AveResultantVel_104) %>%
  mutate("key" = "Change in resultant velocity (cm/s)")
data10A$Trial_type <- factor(data10A$Trial_type, levels = 1:2, labels = c("Retention", "Transfer"))
data10A$conditon2 <- factor(data10A$conditon2)
i1 <- grepl("_RR", levels(data10A$conditon2))
i2 <- grepl("_RL", levels(data10A$conditon2))
i3 <- grepl("_LL", levels(data10A$conditon2))
RRR_levels <- levels(data10A$conditon2)[i1]
RLR_levels <- levels(data10A$conditon2)[i2]
LLL_levels <- levels(data10A$conditon2)[i3]
LRL_levels <- levels(data10A$conditon2)[!i1 & !i2 & !i3]
ord_levels <- c(LLL_levels, RRR_levels, RLR_levels, LRL_levels)
data10A$conditon2 <- factor(data10A$conditon2, levels = ord_levels)
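And a sketch of the unique()-based alternative mentioned above, which postpones the factor conversion to the very end:
vals <- unique(data10A$conditon2)  # works for character or factor
i1 <- grepl("_RR", vals)
i2 <- grepl("_RL", vals)
i3 <- grepl("_LL", vals)
ord_levels <- c(vals[i3], vals[i1], vals[i2], vals[!i1 & !i2 & !i3])
data10A$conditon2 <- factor(data10A$conditon2, levels = ord_levels)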

How to mutate columns in R based on ordering of subset of these columns?

To begin with, let's suppose we have a dataset like this:
data <- data.frame(
  id = 1:5,
  time = c(0.1, 0.2, 0.1, 0.1, 0.2),
  obj_a_size = c(1, 3, 8, 4, 2),
  obj_a_cuteness = c(3, 6, 4, 1, 2),
  obj_b_size = c(5, 4, 4, 2, 5),
  obj_b_cuteness = c(6, 2, 10, 9, 6),
  obj_c_size = c(3, 6, 7, 1, 6),
  obj_c_cuteness = c(10, 1, 6, 8, 8)
)
It has columns concerning the whole experiment (like time) and object-specific columns (like X_size and X_cuteness). These objects are ordered randomly, though, so I'd like to mutate these columns to order the objects by size within each experiment. The result I expect looks like this:
data <- data.frame(
  id = 1:5,
  time = c(0.1, 0.2, 0.1, 0.1, 0.2),
  obj_max_size = c(5, 6, 8, 4, 6),
  obj_max_cuteness = c(6, 1, 4, 1, 8),
  obj_2nd_size = c(3, 4, 7, 2, 5),
  obj_2nd_cuteness = c(10, 2, 6, 9, 6),
  obj_min_size = c(1, 3, 3, 1, 2),
  obj_min_cuteness = c(3, 6, 10, 8, 2)
)
Notice that cuteness isn't ordered descending or ascending, but I want cuteness to be considered part of an object and set obj_max_cuteness = obj_2_cuteness wherever obj_max_size = obj_2_size, and so on.
Number of objects is known in advance (there are four of them), columns are known as well, and there are four columns describing each object. There is no missing data. I'm willing to use any package, if necessary. Also, original dataset is about 500k by 30, so bonus points for quick or memory-friendly code.
EDIT: Some noticed that the description is not very clear. What I'm after is somewhat object-oriented: in the case above, each object within an experiment could be described as follows (the X in obj_X_ means that it belongs to experiment no. X):
obj_1_a = {"size": 1, "cuteness": 3}
obj_1_b = {"size": 5, "cuteness": 6}
obj_1_c = {"size": 3, "cuteness": 10}
obj_2_a = {"size": 3, "cuteness": 6}
...
I want to reorder them by size so that (in the resulting data frame):
obj_1_max = {"size": 5, "cuteness": 6}
obj_1_2nd = {"size": 3, "cuteness": 10}
obj_1_min = {"size": 1, "cuteness": 3}
obj_2_max = {"size": 6, "cuteness": 1}
...
Is this what you are after?
The min and max calculations are straightforward. To find the 2nd max you need to do a bit more work. My interpretation of the 2nd value is that it is the 2nd of the sorted, unique values. My output differs from yours, but that may be due to a different interpretation of what you mean by the 2nd value; my reading is that, within each group of columns (size, cuteness), you are looking for the first value down from the max.
library(dplyr)
library(tidyr)  # pivot_longer() comes from tidyr
data <- data.frame(
  id = 1:5,
  time = c(0.1, 0.2, 0.1, 0.1, 0.2),
  obj_a_size = c(1, 3, 8, 4, 2),
  obj_a_cuteness = c(3, 6, 4, 1, 2),
  obj_b_size = c(5, 4, 4, 2, 5),
  obj_b_cuteness = c(6, 2, 10, 9, 6),
  obj_c_size = c(3, 6, 7, 1, 6),
  obj_c_cuteness = c(10, 1, 6, 8, 8)
)
obj_max_size <- data %>%
  pivot_longer(cols = contains('size')) %>%
  group_by(id) %>%
  summarise(obj_max_size = max(value)) %>%
  ungroup() %>%
  select(obj_max_size)
obj_min_size <- data %>%
  pivot_longer(cols = contains('size')) %>%
  group_by(id) %>%
  summarise(obj_min_size = min(value)) %>%
  ungroup() %>%
  select(obj_min_size)
obj_2nd_size <- data %>%
  pivot_longer(cols = contains('size')) %>%
  group_by(id) %>%
  distinct(value) %>%
  arrange(desc(value)) %>%
  slice(2) %>%
  ungroup() %>%
  select(obj_2nd_size = value)
obj_max_cuteness <- data %>%
  pivot_longer(cols = contains('cuteness')) %>%
  group_by(id) %>%
  summarise(obj_max_cuteness = max(value)) %>%
  ungroup() %>%
  select(obj_max_cuteness)
obj_min_cuteness <- data %>%
  pivot_longer(cols = contains('cuteness')) %>%
  group_by(id) %>%
  summarise(obj_min_cuteness = min(value)) %>%
  ungroup() %>%
  select(obj_min_cuteness)
obj_2nd_cuteness <- data %>%
  pivot_longer(cols = contains('cuteness')) %>%
  group_by(id) %>%
  distinct(value) %>%
  arrange(desc(value)) %>%
  slice(2) %>%
  ungroup() %>%
  select(obj_2nd_cuteness = value)
output <- bind_cols(id = data$id, obj_max_size, obj_min_size, obj_2nd_size,
                    obj_max_cuteness, obj_min_cuteness, obj_2nd_cuteness)
With output looking like this:
> output
# A tibble: 5 x 7
     id obj_max_size obj_min_size obj_2nd_size obj_max_cuteness obj_min_cuteness obj_2nd_cuteness
  <int>        <dbl>        <dbl>        <dbl>            <dbl>            <dbl>            <dbl>
1     1            5            1            3               10                3                6
2     2            6            3            4                6                1                2
3     3            8            4            7               10                4                6
4     4            4            1            2                9                1                8
5     5            6            2            5                8                2                6
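One thing both the expected output and this answer gloss over is keeping each object's cuteness attached to its size. A sketch of that pairing (it assumes three objects per id, as in the example, and breaks ties in size by row order):
library(dplyr)
library(tidyr)
# One row per object: pivot the size/cuteness pairs long, rank by size
# within each id, then pivot back to wide columns.
paired <- data %>%
  pivot_longer(-c(id, time),
               names_pattern = "obj_(.)_(.*)",
               names_to = c("obj", ".value")) %>%
  group_by(id) %>%
  arrange(desc(size), .by_group = TRUE) %>%
  mutate(rank = c("max", "2nd", "min")) %>%
  ungroup() %>%
  pivot_wider(id_cols = c(id, time),
              names_from = rank,
              values_from = c(size, cuteness),
              names_glue = "obj_{rank}_{.value}")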

Find overlapping rows between dataframes using dplyr?

df1 <- data_frame(time1 = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
                  time2 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
                  id = c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"))
df2 <- data_frame(time = sort(runif(100, 0, 10)),
                  C = rbinom(100, 1, 0.5))
For every row in df1, I want to find the rows in df2 that overlap for time, then assign the median C value for this group of df2 rows to a new column in df1. I'm sure there's some simple way to do this with dplyr's between function, but I'm new to R and haven't been able to figure it out. Thanks!
Here's a way, using the merge function to do a SQL-style cross join, then using the between function:
library(tidyverse)
merge(df1, df2, all = TRUE) %>%
  rowwise() %>%
  mutate(time_between = between(time, time1, time2)) %>%
  filter(time_between) %>%
  group_by(time1, time2, id) %>%
  summarise(med_C = median(C))
Using the filter function may result in losing some rows from df1, so an alternative method would be:
merge(df1, df2, all = TRUE) %>%
  rowwise() %>%
  mutate(time_between = between(time, time1, time2)) %>%
  group_by(time1, time2, id) %>%
  summarise(med_C = median(ifelse(time_between, C, NA), na.rm = TRUE))
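With dplyr 1.1.0 or later, a non-equi join is a sketch-worthy alternative that avoids materializing the full cross join (note that, like the filter() version, df1 rows with no overlapping df2 rows are dropped by the inner join):
library(dplyr)  # >= 1.1.0 for join_by()
df2 %>%
  inner_join(df1, by = join_by(between(time, time1, time2))) %>%
  group_by(time1, time2, id) %>%
  summarise(med_C = median(C), .groups = "drop")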
You can do this in base R with sapply:
df1$median_c <- sapply(seq_along(df1$id), function(i) {
  median(df2$C[df2$time > df1$time1[i] & df2$time < df1$time2[i]])
})
