Using a loop to create table with results of ICC in r - r

I created a loop to calculate the icc between two raters.
For each rater (R1, R2) I have a data frame of the 75 variables in columns and 125 observations.
library(irr)
for (i in 1:75) {
icc <- icc(cbind.data.frame(R1[,i],R2[,i]), model="twoway", type="agreement",
unit="single")
print(icc)
}
icc returns as a list of results icc for each variable.
I tried to integrate in the loop a function that will generate a data frame for the objects of icc that interest me (value, lower and upper bounder of the 95% confident interval) but it returns in different ways separate tables:
With this first attempt it returns 75 data frames of only one line each one, even if I used an rbind command
for (i in 1:75) {
icc <- icc(cbind.data.frame(R1[,i],R2[,i]), model="twoway", type="agreement",
unit="single")
print(rbind.data.frame(cbind.data.frame(icc$value,icc$lbound,icc$ubound)))
}
in the second case it returns 75 different data frames filled each one of the icc'objects of one variable.
for (i in 1:75) {
icc <- icc(cbind.data.frame(R1[,i],R2[,i]), model="twoway", type="agreement",
unit="single")
name_lines_are_variables <- names(L1)
name_columns <- c("ICC","Low CI 95%","Up CI 95%)
tab <- matrix(c(icc$value,icc$conf.level),nrow=38,ncol=2)
dimnames(tab) <- list(name_lines_are_variables,name_columns)
print(tab)
I appreciate your help

If I've understood your post correctly, then the problem with your code is that it the results from the icc() function are not being accumulated.
You can solve this problem by declaring an empty data.frame before the for loop, and then using rbind() to append the latest results to the existing results in this data.frame.
Please refer to the code below for an implementation (refer to the comments for clarifications):
rm(list = ls())
#Packages
library(irr)
#Dummy data
R1 <- data.frame(matrix(sample(1:100, 75*125, replace = TRUE), nrow = 75, ncol = 125))
R2 <- data.frame(matrix(sample(1:100, 75*125, replace = TRUE), nrow = 75, ncol = 125))
#Data frame that will accumulate the ICC results
#Initialized with zero rows (but has named columns)
my_icc <- data.frame(R1_col = character(), R2_col = character(),
icc_val = double(), icc_lb = double(),
icc_ub = double(), icc_conflvl = double(),
icc_pval = double(),
stringsAsFactors = FALSE)
#For loop
#Iterates through each COLUMN in R1 and R2
#And calculates ICC values with these as inputs
#Each R1[, i]-R2[, j] combination's results are stored
#as a row each in the my_icc data frame initialized above
for (i in 1:ncol(R1)){
for (j in 1:ncol(R2)){
#tmpdat is just a temporary variable to hold the current calculation's data
tmpdat <- irr::icc(cbind.data.frame(R1[, i], R2[, j]), model = "twoway", type = "agreement", unit = "single")
#Results from current cauculation being appended to the my_icc data frame
my_icc <- rbind(my_icc,
data.frame(R1_col = colnames(R1)[i], R2_col = colnames(R2)[j],
icc_val = tmpdat$value, icc_lb = tmpdat$lbound,
icc_ub = tmpdat$ubound, icc_conflvl = tmpdat$conf.level,
icc_pval = tmpdat$p.value,
stringsAsFactors = FALSE))
}
}
head(my_icc)
# R1_col R2_col icc_val icc_lb icc_ub icc_conflvl icc_pval
# 1 X1 X1 0.14109954 -0.09028373 0.3570681 0.95 0.1147396
# 2 X1 X2 0.07171398 -0.15100798 0.2893685 0.95 0.2646890
# 3 X1 X3 -0.02357068 -0.25117399 0.2052619 0.95 0.5791774
# 4 X1 X4 0.07881817 -0.15179084 0.3004977 0.95 0.2511141
# 5 X1 X5 -0.12332146 -0.34387645 0.1083129 0.95 0.8521741
# 6 X1 X6 -0.17319598 -0.38833452 0.0578834 0.95 0.9297514

Thank you a lot for your help #Dunois. I just had to keep the same variable in the for() loop, because I have to compare the same variables columns for each rater, so the final code :
library(irr)
R1 <- data.frame(matrix(sample(1:100, 75*125, replace = TRUE), nrow = 75, ncol = 125))
R2 <- data.frame(matrix(sample(1:100, 75*125, replace = TRUE), nrow = 75, ncol = 125))
my_icc <- data.frame(R1_col = character(), R2_col = character(),
icc_val = double(), icc_lb = double(),
icc_ub = double(), icc_conflvl = double(),
icc_pval = double(),
stringsAsFactors = FALSE)
for (i in 1:ncol(R1)){
tmpdat <- irr::icc(cbind.data.frame(R1[, i], R2[, i]), model = "twoway", type = "agreement", unit = "single")
my_icc <- rbind(my_icc,
data.frame(R1_col = colnames(R1)[i], R2_col = colnames(R2)[i],
icc_val = tmpdat$value, icc_lb = tmpdat$lbound,
icc_ub = tmpdat$ubound, icc_conflvl = tmpdat$conf.level,
icc_pval = tmpdat$p.value,
stringsAsFactors = FALSE))
}
head(my_icc)
#R1_col R2_col icc_val icc_lb icc_ub icc_conflvl icc_pval
#1 X1 X1 0.116928667 -0.1147526 0.33551788 0.95 0.1601141
#2 X2 X2 0.006627921 -0.2200660 0.23238172 0.95 0.4773967
#3 X3 X3 -0.184898902 -0.3980084 0.04542289 0.95 0.9427605
#4 X4 X4 0.066504226 -0.1646006 0.28963006 0.95 0.2862440
#5 X5 X5 -0.035662755 -0.2603757 0.19227801 0.95 0.6196883
#6 X6 X6 -0.055329309 -0.2808315 0.17466685 0.95 0.6805675

Related

neural network with R package nnet: rubbish prediction due to overfitting?

Trying to figure out if I have an R problem or a general neural net problem.
Say I have this data:
set.seed(123)
n = 1e3
x = rnorm(n)
y = 1 + 3*sin(x/2) + 15*cos(pi*x) + rnorm(n = length(x))
df = data.frame(y,x)
df$train = sample(c(TRUE, FALSE), length(y), replace=TRUE, prob=c(0.7,0.3))
df_train = subset(df, train = TRUE)
df_test = subset(df, train = FALSE)
then you train the neural net and it looks good on the holdout:
library(nnet)
nn = nnet(y~x, data = df_train, size = 60, linout=TRUE)
yhat_nn = predict(nn, newdata = df_test)
plot(df_test$x,df_test$y)
points(df_test$x, yhat_nn, col = 'blue')
Ok, so then I thought, let's just generate new data and then predict using the trained net. But the predictions are way off:
x2 = rnorm(n)
y2 = 1 + 3*sin(x2/2) + 15*cos(pi*x2) + rnorm(n = length(x2))
df2 = data.frame(y2,x2)
plot(df2$x, df2$y)
points(df2$x, predict(nn, newdata = df2), col = 'blue')
Is this because I overfitted to the training set? I thought by splitting the original data into test-train I would avoid overfitting.
The fatal issue is that your new data frame, df2, does not have the correct variable names. As a result, predict.nnet can not find the right values.
names(df)
#[1] "y" "x" "train"
names(df2)
#[1] "y2" "x2"
Be careful when you construct a data frame for predict.
## the right way
df2 <- data.frame(y = y2, x = x2)
## and it solves the mystery
plot(df2$x, df2$y)
points(df2$x, predict(nn, newdata = df2), col = 'blue')
Another minor issue is your use of subset. It should be
## not train = TRUE or train = FALSE
df_train <- subset(df, train == TRUE) ## or simply subset(df, train)
df_test <- subset(df, train == FALSE) ## or simply subset(df, !train)
This has interesting effect:
nrow(subset(df, train == TRUE))
#[1] 718
nrow(subset(df, train = TRUE)) ## oops!!
#[1] 1000
The complete R session
set.seed(123)
n = 1e3
x = rnorm(n)
y = 1 + 3*sin(x/2) + 15*cos(pi*x) + rnorm(n = length(x))
df = data.frame(y,x)
df$train = sample(c(TRUE, FALSE), length(y), replace=TRUE, prob=c(0.7,0.3))
df_train = subset(df, train == TRUE) ## fixed
df_test = subset(df, train == FALSE) ## fixed
library(nnet)
nn = nnet(y~x, data = df_train, size = 60, linout=TRUE)
yhat_nn = predict(nn, newdata = df_test)
plot(df_test$x,df_test$y)
points(df_test$x, yhat_nn, col = 'blue')
x2 = rnorm(n)
y2 = 1 + 3*sin(x2/2) + 15*cos(pi*x2) + rnorm(n = length(x2))
df2 = data.frame(y = y2, x = x2) ## fixed
plot(df2$x, df2$y)
points(df2$x, predict(nn, newdata = df2), col = 'blue')

Converting a Nested For Loop into `sapply()` in R

I have been trying to create a series of coplots using a nested for loop but the loop takes too long to run (the original data set is very big). I have looked at similar questions and they suggest using the sapply function but I am still unclear about how to convert between the 2. I understand I need to create a plotting function to use (see below) but what I don't understand is how the i's and j's of the nested for loop into sapply arguements.
I have made some sample data, the nested for loop that I have been using and the plotting function I created that are below. Could someone walk me through how I convert my nested for loop into sapply arguements. I have been doing all of this in R. Many Thanks
y = rnorm(n = 200, mean = 10, sd = 2)
x1 = rnorm(n = 200, mean = 5, sd = 2)
x2 = rnorm(n = 200, mean = 2.5, sd = 2)
x3 = rep(letters[1:4], each = 50)
x4 = rep(LETTERS[1:8], each = 25)
dat = data.frame(y = y, x1 = x1, x2 = x2, x3 = x3, x4 = x4)
for(i in dat[, 2:3]){
for(j in dat[, 4:5]){
coplot(y ~ i | j, rows = 1, data = dat)
}
}
coplop_fun = function(data, x, y, x, na.rm = TRUE){
coplot(.data[[y]] ~ .data[[x]] | .data[[z]], data = data, rows = 1)
}
I think you might be able to use mapply here and not sapply. mapply is similar to sapply but allows for you to pass two inputs instead of one.
y = rnorm(n = 200, mean = 10, sd = 2)
x1 = rnorm(n = 200, mean = 5, sd = 2)
x2 = rnorm(n = 200, mean = 2.5, sd = 2)
x3 = rep(letters[1:4], each = 50)
x4 = rep(LETTERS[1:8], each = 25)
dat = data.frame(y = y, x1 = x1, x2 = x2, x3 = x3, x4 = x4)
for(i in dat[, 2:3]){
for(j in dat[, 4:5]){
coplot(y ~ i | j, rows = 1, data = dat)
}
}
mapply(function(x,j){coplot(dat[["y"]]~x|j,rows =1)}, dat[,2:3],dat[,4:5])
We can use a combination of functions expand.grid, formula and apply to accept character column names into coplot.
# combinations of column names for plotting
vars <- expand.grid(y = "y", x = c("x1", "x2"), z = c("x3", "x4"))
# cycle through column name variations, construct formula for each combination
apply(vars, MARGIN = 1,
FUN = function(x) coplot(
formula = formula(paste(x[1], "~", x[2], "|", x[3])),
data = dat, row = 1
)
)
Here's a tidyverse version of #nya's solution with expand.grid() and apply(). Each row in ds_plot_parameters represents a single plot. The equation variable is the string eventually passed to coplot().
Each equation is passed to purrr::walk(), which then calls coplot()
to produce one graph each. as.equation() converts the string to an equation.
ds_plot_parameters <-
tidyr::expand_grid(
v = c("x1", "x2"),
w = c("x3", "x4")
) |>
dplyr::mutate(
equation = paste0("y ~ ", v, " | ", w),
)
ds_plot_parameters$equation |>
purrr::walk(
\(e) coplot(as.formula(e), rows = 1, data = dat)
)
Gravy:
If you want to more input to the graph, then expand ds_plot_parameters to include other things like graph & axis titles.
ds_plot_parameters <-
tidyr::expand_grid(
v = c("x1", "x2"),
w = c("x3", "x4")
) |>
dplyr::mutate(
equation = paste0("y ~ ", v, " | ", w),
label_y = "Outcome (mL)",
label_x = paste(v, " (log 10)")
)
ds_plot_parameters |>
dplyr::select(
# Make sure this order exactly matches the function signature
equation,
label_x,
label_y,
) |>
purrr::pwalk(
.f = \(equation, label_x, label_y) {
coplot(
formula = as.formula(equation),
xlab = label_x,
ylab = label_y,
rows = 1,
data = dat
)
}
)
ds_plot_parameters
# # A tibble: 4 x 5
# v w equation label_y label_x
# <chr> <chr> <chr> <chr> <chr>
# 1 x1 x3 y ~ x1 | x3 Outcome (mL) x1 (log 10)
# 2 x1 x4 y ~ x1 | x4 Outcome (mL) x1 (log 10)
# 3 x2 x3 y ~ x2 | x3 Outcome (mL) x2 (log 10)
# 4 x2 x4 y ~ x2 | x4 Outcome (mL) x2 (log 10)

R Imputation With MICE

set.seed(1)
library(data.table)
data=data.table(STUDENT = 1:1000,
OUTCOME = sample(20:90, r = T),
X1 = runif(1000),
X2 = runif(1000),
X3 = runif(1000))
data[, X1 := fifelse(X1 > .9, NA_real_, X1)]
data[, X2 := fifelse(X2 > .78 & X2 < .9, NA_real_, X1)]
data[, X3 := fifelse(X3 < .1, NA_real_, X1)]
Say you have data as shown and you wish to impute values for X1, X2, X3 and leave out STUDENT and OUTCOME for the imputation processing.
I can do
library(mice)
dataIMPUTE=mice(data[, c("X1", "X2", "X3")], m = 1)
but how do I get together the imputing values from dataIMPUTE with STUDENT and OUTCOME? I am afraid that I will merge wrong and that is why I ask if you have advice for this.
One possibility is to use the complete data set in the imputation, but change the predictorMatrix so that STUDENT and OUTCOME are not used in the imputation model.
First, you need to run mice to extract the predictorMatrix (without calculating the imputation). Then you can set all columns to 0 that shouldn't be included in the imputation model. However, all your variables are still contained in your dataIMPUTE object:
set.seed(1)
library(data.table)
data=data.table(STUDENT = 1:1000,
OUTCOME = sample(20:90, r = T),
X1 = runif(1000),
X2 = runif(1000),
X3 = runif(1000))
index_1 <- sample(1:1000, 100)
index_2 <- sample(1:1000, 100)
index_3 <- sample(1:1000, 100)
data[index_1, X1 := NA_real_]
data[index_2, X2 := NA_real_]
data[index_3, X3 := NA_real_]
library(mice)
init <- mice(data, maxit = 0, print = FALSE)
# extract the predictor matrix
pred_mat <- init$predictorMatrix
# remove STUDENT and OUTCOME as predictors
pred_mat[, c("STUDENT", "OUTCOME")] <- 0
# do the imputation
dataIMPUTE = mice(data, pred = pred_mat, m = 1)

Creating a t-test loop over a dataframe using an index

So, let's say I have a 1000-row, 6-column dataframe, the columns are a1, a2, b1, b2, c1, c2. I want to run some t-tests using a's, b's, and c's and get an output df with 3 columns for the t-values of a-b-c and another three for the significance information for those values, making it a total of 6 columns. The problem I have is with rows, I want to loop over chunks of 20, rendering the output a (1000/20=)50-row, 6-column df.
I have already tried creating an index column for my inital df which repeats a 1 for the first 20 row, a 2 for the next 20 row and so on.
convert_n <- function(df) {
df <- df %T>% {.$n_for_t_tests = rep(c(1:(nrow(df)/20)), each = 20)}
}
df <- convert_n(df)
However, I can't seem to find a way to properly utilize the items in this column as indices for a "for" or any kind of loop.
Below you can see the relevant code for that creates a 1-row, 6-column df; I need to modify the [0:20] parts, create a loop that does this for 20 groups and binds them.
t_test_a <- t.test(df$a1[0:20], dfff$a2[0:20], paired = T, conf.level
= 0.95)
t_test_b <- t.test(df$b1[0:20], dfff$b2[0:20], paired = T, conf.level
= 0.95)
t_test_c <- t.test(df$c1[0:20], dfff$c2[0:20], paired = T, conf.level
= 0.95)
t_tests_df <- data.frame(t_a = t_test_a$statistic[["t"]],
t_b = t_test_b$statistic[["t"]],
t_c = t_test_c$statistic[["t"]])
t_tests_df <- t_tests_df %T>% {.$dif_significance_a = ifelse(.$t_a >
2, "YES", "NO")} %T>%
{.$dif_significance_b = ifelse(.$t_b >
2, "YES", "NO")} %T>%
{.$dif_significance_c = ifelse(.$t_c >
2, "YES", "NO")} %>%
dplyr::select(t_a, dif_significance_a,
t_b, dif_significance_b,
t_c, dif_significance_c)
Thank you in advance for your help.
You can use split() and sapply():
set.seed(42)
df <- data.frame(a1 = sample(1000, 1000), a2 = sample(1000, 1000),
b1 = sample(1000, 1000), b2 = sample(1000, 1000),
c1 = sample(1000, 1000), c2 = sample(1000, 1000))
group <- gl(50, 20)
D <- split(df, group)
myt <- function(Di)
with(Di, c(at=t.test(a1, a2)$statistic, ap=t.test(a1, a2)$p.value,
bt=t.test(b1, b2)$statistic, bp=t.test(b1, b2)$p.value,
ct=t.test(c1, c2)$statistic, cp=t.test(c1, c2)$p.value))
sapply(D, FUN=myt) ### or
t(sapply(D, FUN=myt))
This is not the most pretty but i did a for loop like this:
df <- data.frame(a1 = sample(1000, 1000),
a2 = sample(1000, 1000),
b1 = sample(1000, 1000),
b2 = sample(1000, 1000),
c1 = sample(1000, 1000),
c2 = sample(1000, 1000))
df_ttest <- data.frame(p_a = c(1:50),
t_a = c(1:50),
p_b = c(1:50),
t_b = c(1:50),
p_c = c(1:50),
t_c = c(1:50))
index <- 0:50*20
for(i in seq_along(index)) {
df_ttest$p_a[i] = t.test(df$a1[index[i] : index[i+1]])$p.value
df_ttest$p_b[i] = t.test(df$b1[index[i] : index[i+1]])$p.value
df_ttest$p_c[i] = t.test(df$c1[index[i] : index[i+1]])$p.value
df_ttest$t_a[i] = t.test(df$a1[index[i] : index[i+1]])$statistic
df_ttest$t_b[i] = t.test(df$b1[index[i] : index[i+1]])$statistic
df_ttest$t_c[i] = t.test(df$c1[index[i] : index[i+1]])$statistic
}
This gives a 50x6 dataframe with seperate columns of p and t values for every 20 row chunk of a, b and c.
You could even go further and make a nested for loop to cycle through each row in df_ttest to make this abit prettier.

nls: Loop and break in a decided number of iterations

I've like two make to sequential operations:
1) Ajusted two nls models in a subset; and
2) Loop the models just a number of iteracions =1.
For the first step I make:
#Packages
library(minpack.lm)
# Data set - Diameter in function of Feature and Age
Feature<-sort(rep(c("A","B"),22))
Age<-c(60,72,88,96,27,
36,48,60,72,88,96,27,36,48,60,72,
88,96,27,36,48,60,27,27,36,48,60,
72,88,96,27,36,48,60,72,88,96,27,
36,48,60,72,88,96)
Diameter<-c(13.9,16.2,
19.1,19.3,4.7,6.7,9.6,11.2,13.1,15.3,
15.4,5.4,7,9.9,11.7,13.4,16.1,16.2,
5.9,8.3,12.3,14.5,2.3,5.2,6.2,8.6,9.3,
11.3,15.1,15.5,5,7,7.9,8.4,10.5,14,14,
4.1,4.9,6,6.7,7.7,8,8.2)
d<-dados <- data.frame(Feature,Age,Diameter)
str(d)
#Create a nls model (Levenberg-Marquardt algoritm) for each Feature (A abd B)
e1<- Diameter ~ a1 * Age^a2
Fecture_vec<-unique(d$Feature)
mod_ND <- list() #List for save each model
for(i in 1:length(Fecture_vec)){
d2 <- subset(d, d$Feature == Fecture_vec[i])
mod_ND[[i]] <- nlsLM(e1, data = d2,
start = list(a1 = 0.1, a2 = 10),
control = nls.control(maxiter = 1000))
print(summary(mod_ND[[i]]))
}
#
Here, so far so good, but if I try to make a loop with 999 simulation and recycle the start values with coef(mod_ND[[i]])[1] and coef(mod_ND[[i]])[2] and stop when number of iterations is 1:
e1<- Diameter ~ a1 * Age^a2
Fecture_vec<-unique(d$Feature)
mod_ND <- list() #List for save each model
for(i in 1:length(Fecture_vec)){
d2 <- subset(d, d$Feature == Fecture_vec[i])
mod_ND[[i]] <- nlsLM(e1, data = d2,
start = list(a1 = 0.1, a2 = 10),
control = nls.control(maxiter = 1000))
Xs<-data.frame()
for(z in 1:999){
d2 <- subset(d, d$Feature == Fecture_vec[i])
mod_ND[[z]] <- nlsLM(e1, data = d2,
start = list(a1 = coef(mod_ND[[i]])[1], a2 = mod_ND[[i]])[2]),
control = nls.control(maxiter = 1000))
if (mod_ND[[z,c(finIter")]] <= 1){ break } ## Stop when iteractions =1
print(summary(mod_ND[[z]]))
}
}
#
Doesn't work!! Please any ideas?
#Packages
library(minpack.lm)
library(dplyr)
m<-function(d, a=0.01,b=10){
mod<- nlsLM(Diameter ~ a1 * Age^a2,start = list(a1 = a, a2 = b),control = nls.control(maxiter = 1000), data = d)
par1<- summary(mod)$coefficients[[1]]
par2 <- summary(mod)$coefficients[[2]]
print(summary(mod))
if(mod$convInfo[["finIter"]]>1){
m(d,par1,par2)
}else{
print(" --------Feature B-----------")
}
}
list_models <- dlply(d,.(Feature),m)
list_models

Resources