Solution (thanks #Peter_Evan!) in case anyone coming across this question has a similar issue
(Original question is below)
## get all slopes (lm coefficients) first
# list of subfields of interest to loop through
sf <- c("left_presubiculum", "right_presubiculum",
        "left_subiculum", "right_subiculum", "left_CA1", "right_CA1",
        "left_CA3", "right_CA3", "left_CA4", "right_CA4", "left_GC-ML-DG",
        "right_GC-ML-DG")
# fail early, with a readable message, if DF lacks a column used below
stopifnot(all(c("Subject", "ICV", sf) %in% names(DF)))
# dependent variables are sf, independent variable common to all models is ICV
# fits lm(subfield ~ ICV) for every subfield of interest (sf) specified previously
# the response is taken with DF[[dv]] so it can only ever be a column of DF;
# get(dv) would fall back to any same-named object in an enclosing environment
lm.results <- lapply(sf, function(dv) {
  temp.lm <- lm(DF[[dv]] ~ ICV, data = DF)
  coef(temp.lm)
})
# returns a list, where each element is a vector of coefficients
# do.call(rbind, ) will paste them together
lm.coef <- data.frame(sf = sf,
                      do.call(rbind, lm.results))
# tidy up name of intercept variable
names(lm.coef)[2] <- "intercept"
lm.coef
## set up all components for the equation
# ICV centred on the mean of all ICV values used to estimate the slopes
# (computed once instead of once per subject/subfield pair)
icv.centred <- DF$ICV - mean(DF$ICV)
# one slope per subfield, in the same order as sf
slopes <- lm.coef$ICV[match(sf, lm.coef$sf)]
# adjusted = raw - slope * (ICV - mean(ICV)) for every subject and subfield at
# once: outer() builds the subject x subfield matrix of corrections
out <- as.matrix(DF[sf]) - outer(icv.centred, slopes)
# name the rows after each subject
rownames(out) <- DF$Subject
# name the columns after each subfield
colnames(out) <- sf
# check output
out
===============
Original Question:
I have a dataframe (DF) with 13 columns (12 different brain subfields, and one column containing total intracranial volume(ICV)) and 50 rows (each a different participant). I'm trying to automate an equation being looped over every column for each participant.
The data:
structure(list(Subject = c("sub01", "sub02", "sub03", "sub04",
"sub05", "sub06", "sub07", "sub08", "sub09", "sub10", "sub11",
"sub12", "sub13", "sub14", "sub15", "sub16", "sub17", "sub18",
"sub19", "sub20"), ICV = c(1.50813, 1.3964237, 1.6703585, 1.4641886,
1.6351018, 1.5524641, 1.4445532, 1.6384505, 1.6152434, 1.5278011,
1.4788126, 1.4373356, 1.4109637, 1.3634952, 1.3853583, 1.4855268,
1.6082085, 1.5644998, 1.5617522, 1.4304141), left_subiculum = c(411.225013,
456.168033, 492.968477, 466.030173, 533.95505, 476.465524, 448.278213,
476.45566, 422.617374, 498.995121, 450.773906, 461.989663, 549.805272,
452.619547, 457.545623, 451.988333, 475.885847, 490.127968, 470.686415,
494.06548), left_CA1 = c(666.893596, 700.982955, 646.21927, 580.864234,
721.170599, 737.413139, 737.683665, 597.392434, 594.343911, 712.781376,
733.157168, 699.820162, 701.640861, 690.942843, 606.259484, 731.198846,
567.70879, 648.887718, 726.219904, 712.367433), left_presubiculum = c(325.779458,
391.252815, 352.765098, 342.67797, 390.885737, 312.857458, 326.916867,
350.657957, 325.152464, 320.718835, 273.406949, 305.623938, 371.079722,
315.058313, 311.376271, 319.56678, 348.343569, 349.102678, 322.39908,
306.966008), `left_GC-ML-DG` = c(327.037756, 305.63224, 328.945065,
238.920358, 319.494513, 305.153183, 311.347404, 259.259723, 295.369164,
312.022281, 324.200989, 314.636501, 306.550385, 311.399107, 295.108592,
356.197094, 251.098248, 294.76349, 317.308576, 301.800253), left_CA3 = c(275.17038,
220.862237, 232.542718, 170.088695, 234.707172, 210.803287, 246.861975,
171.90896, 220.83478, 236.600832, 246.842024, 239.677362, 186.599097,
224.362411, 229.9142, 293.684776, 172.179779, 202.18936, 232.5666,
221.896625), left_CA4 = c(277.614028, 264.575987, 286.605092,
206.378619, 281.781858, 258.517989, 269.354864, 226.269982, 256.384436,
271.393257, 277.928824, 265.051581, 262.307377, 266.924683, 263.038686,
306.133918, 226.364556, 262.42823, 264.862956, 255.673948), right_subiculum = c(468.762375,
445.35738, 446.536018, 456.73484, 521.041823, 482.768261, 487.2911,
456.39996, 445.392976, 476.146498, 451.775611, 432.740085, 518.170065,
487.642399, 405.564237, 487.188989, 467.854363, 479.268714, 473.212833,
472.325916), right_CA1 = c(712.973011, 717.815214, 663.637105,
649.614586, 711.844375, 779.212704, 862.784416, 648.925038, 648.180611,
760.761704, 805.943016, 717.486756, 801.853608, 722.213109, 621.676321,
791.672796, 605.35667, 637.981476, 719.805053, 722.348921), right_presubiculum = c(327.285242,
364.937865, 288.322641, 348.30058, 341.309111, 279.429847, 333.096795,
342.184296, 364.245998, 350.707173, 280.389853, 276.423658, 339.439377,
321.534798, 302.164685, 328.365751, 341.660085, 305.366589, 320.04127,
303.83284), `right_GC-ML-DG` = c(362.391907, 316.853532, 342.93274,
282.550769, 339.792696, 357.867386, 342.512721, 277.797528, 309.585721,
343.770416, 333.524912, 302.505077, 309.063135, 291.29361, 302.510461,
378.682679, 255.061044, 302.545288, 313.93902, 297.167161), right_CA3 = c(307.007404,
243.839349, 269.063801, 211.336979, 249.283479, 276.092623, 268.183349,
202.947849, 214.642782, 247.844657, 291.206598, 235.864996, 222.285729,
201.427853, 237.654913, 321.338801, 199.035108, 243.204203, 236.305659,
213.386702), right_CA4 = c(312.164065, 272.905586, 297.99392,
240.765062, 289.98697, 306.459566, 284.533068, 245.965817, 264.750571,
296.149675, 290.66935, 264.821461, 264.920869, 246.267976, 266.07378,
314.205819, 229.738951, 274.152503, 256.414608, 249.162404)), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
The equation:
adjustedBrain(participant1) = rawBrain(participant1) - slope*[ICV(participant1) - (mean of all ICV measures included in the calculation of the slope)]
The code (which is not working and I was hoping for some pointers):
# Attempted helper, quoted as posted: it does NOT run as written.
# Intended purpose: apply the adjustment equation to the subfield columns of DF.
#   DF      - data frame holding Subject, ICV and the subfield columns
#   subject - intended subject id; note the body never references this argument
# Nothing is explicitly returned.
adjusted_Brain <- function(DF, subject) {
# names of the 12 subfield columns (select() is presumably dplyr::select - confirm)
subfields <- colnames(select(DF, "left_presubiculum", "right_presubiculum",
"left_subiculum", "right_subiculum", "left_CA1", "right_CA1",
"left_CA3", "right_CA3", "left_CA4", "right_CA4", "left_GC-ML-DG",
"right_GC-ML-DG"))
# result matrix is allocated here but is never filled or returned below
out <- matrix(ncol = length(subfields), nrow = NROW(DF))
for (i in seq_along(subfields)) {
# "subject" and "i" are quoted, so they are literal strings rather than the
# function argument / loop variable: the row filter compares Subject with the
# text "subject", and the column lookup asks for a column literally named "i"
# `slope` is not defined anywhere inside this function
# the assignment targets the i-th column of the local copy of DF, not `out`
DF[i] = DF[DF$Subject == "subject", "i"] -
slope * (DF[DF$Subject == "subject", "ICV"] -
mean(DF$ICV))
}
}
Getting this error:
Error: Can't subset columns that don't exist.
x Column `i` doesn't exist.
A few notes:
The slopes for each subject for each subfield will be different (and will come from a regression) -> is there a way to specify that in the function so the slope (coefficient from the appropriate regression equation) gets called in?
I have my nrow set to the number of participants right now in the output because I'd like to have this run through EVERY subject across EVERY subfield and spit out a matrix with all the adjusted brain volumes... But that seems very complicated and so for now I will just settle for running each participant separately.
Any help is greatly appreciated!
As others have noted in the comments, there are quite a few syntax issues that prevent your code from running, as well as a few unstated requirements. That aside, I think there is enough to recommend a few improvements that you can hopefully build on. Here are the top line changes:
You likely don't need this to be a function, but rather a nested for loop (if you want to do this with base R). As written, the code isn't flexible enough to merit a function. If you intend to apply this many times across different datasets, a function might make sense. However, it will require a much larger rewrite.
Assuming you are fitting a simple regression via lm, then you can pull out the coefficient of interest via the $ operator and indexing (see below). Some thought will need to go into how to handle different models in the loop. Here, we assume you only need one coefficient from one model.
There are a few areas where the syntax is incorrect, and a review of subsetting in base R would be helpful. Others have pointed out in the comments where some of these are.
Here is one approach where we loop through each subject (j) and each feature or subfield (i) and store the results in a matrix (out). This is just one approach and will almost certainly need tweaking on your end!
# NOTE: the dataset you provided is saved as x in this example.
# fit a linear model - here we assume there is only one coef. of interest, but you may need to alter
# depending on how the slope changes in each calculation
# NOTE(review): this placeholder model regresses ICV on right_CA3; the adjustment
# presumably needs the slope of each subfield on ICV - confirm and swap in the
# model you actually need
reg <- lm(ICV ~ right_CA3, x)
# view the coeff.
reg$coefficients
# pull out the slope by name: the first coefficient is the intercept, so
# reg$coefficients[[1]] would not be the slope
slope <- reg$coefficients[["right_CA3"]]
# list of features/subfields to loop through
sf <- c("left_presubiculum", "right_presubiculum",
        "left_subiculum", "right_subiculum", "left_CA1", "right_CA1",
        "left_CA3", "right_CA3", "left_CA4", "right_CA4", "left_GC-ML-DG",
        "right_GC-ML-DG")
# matrix to store output
out <- matrix(ncol = length(sf), nrow = NROW(x))
# name the rows after each subject
row.names(out) <- x$Subject
# name the columns after each subfield
colnames(out) <- sf
# mean ICV is the same for every subject and subfield, so compute it once
mean_icv <- mean(x$ICV)
# nested for loop that goes by subject (j) and features/subfields (i)
for (j in x$Subject) {
  for (i in sf) {
    out[j, i] <- as.numeric(x[x$Subject == j, i] - (slope * (x[x$Subject == j, "ICV"] - mean_icv)))
  }
}
# check output
out
I have the following reproducible data:
MyScaledData contains scaled values between 0 and 1 for 6 variables. minvec and maxvec are named vectors that contain the minimum and maximum values, respectively, from the original data set that was used to create the scaled data frame MyScaledData. minvec and maxvec contain values for all 22 variables of the original data set, including the 6 variables I now have in MyScaledData.
X14863 X15066 X15067 X15068 X15069 X15070
0.6014784 0.6975109 0.5043208 0.15284648 0.9416364 0.7860731
0.2495215 0.7801444 0.6683925 0.13768245 0.4277954 0.2058412
0.6167705 0.3344044 0.9254125 0.12777565 0.3826231 0.2590457
0.1227380 0.4448501 0.3961802 0.19117246 0.7789835 0.7587897
0.7299760 0.6375931 0.5760061 0.44746838 0.3634903 0.1079679
0.1988647 0.7814712 0.6572054 0.71409305 0.6715690 0.4029459
0.5041371 0.6374958 0.9333635 0.89057831 0.5716711 0.7219823
0.5774327 0.7677038 0.7622717 0.45288270 0.2817869 0.2572325
0.6809509 0.6089656 0.8191862 0.01151454 0.2780449 0.4655353
0.5754383 0.5662045 0.7003630 0.62559642 0.2865510 0.1847980
MyScaledData<-structure(list(X14863=c(0.601478444979532,0.249521497274968,0.616770466379489,0.122737966507165,0.729975993009922,0.198864661389536,0.504137054265617,0.577432671357089,0.680950947164095,0.575438259547452),X15066=c(0.697510926657699,0.780144354632397,0.334404422875259,0.444850091405716,0.637593061483412,0.781471212351781,0.637495834667556,0.7677038048039,0.608965550162107,0.566204459603197),X15067=c(0.50432083998529,0.668392530333367,0.925412484830622,0.396180214305286,0.576006062451239,0.657205387087382,0.933363470346907,0.762271729415789,0.819186151914183,0.700362991098644),X15068=c(0.152846483002917,0.137682446305942,0.127775652495726,0.191172455317975,0.447468375530484,0.714093046059637,0.890578310935752,0.452882699805154,0.011514536383708,0.625596417031532),X15069=c(0.94163636689763,0.427795395079331,0.38262308941233,0.77898345642139,0.363490265569212,0.671568951210917,0.571671115989958,0.281786881885636,0.278044876559552,0.286551022600823),X15070=c(0.786073059382553,0.205841229942702,0.259045736299276,
0.758789694211416,0.107967864736275,0.402945912782515,0.721982268066207,0.257232456508833,0.46553533255268,0.184798001614338)),row.names=c(NA,10L),class="data.frame"); minvec<-c(X14861=22.95,X14862=29.95,X14863= 39.95,X15066=59.95,X15067=79.95,X15068=14.99,X15069=24.99,X15070=33.45,X15071=36.95,X15072=44.95,X15073=54.95,X15074=74.95,X15132=12.95,X15548=12.95,X15549=22.95,X15550=29.95,X15551=39.95,X15552=59.95,X15553=79.95,X15956=49.95,X15957=49.95,X16364=3.5);maxvec<-c(X14861=29.99,X14862=39.99,X14863=49.99,X15066=79.99,X15067=99.99,X15068=19.99,X15069=29.99,X15070=39.99,X15071=49.99,X15072=59.99,X15073=79.99,X15074=99.99,X15132=19.99,X15548=19.99,X15549=29.99,X15550=39.99,X15551=49.99,X15552=79.99,X15553=99.99,X15956=59.99,X15957=59.99,X16364=9.99)
I want to rescale back MyScaledData to their original scale by matching the min/max values to each corresponding column based on name. I've tried the following:
# Undo min-max scaling: stretch x by the width of [minval, maxval] and shift it
# so that 0 maps to minval and 1 maps to maxval.
descale <- function(x, minval, maxval) {
  value_range <- maxval - minval
  x * value_range + minval
}
as.data.frame(Map(descale,MyScaledData,minvec,maxvec))
The output I get has more columns than the 6 that MyScaledData has. I suspect that the function is not matching columns by name at all, and therefore the output is not calculated correctly. How can I match by column name so that the function takes the corresponding minvec and maxvec element for each column and returns only the 6 columns I have?
Desired output shall be:
MyDeScaledData <- structure(list(X14863 = c(45.9888435875945, 42.4551958326407,46.1423754824501, 41.1822891837319, 47.2789589698196, 41.9466012003509,45.0115360248268, 45.7474240204252, 46.7867475095275, 45.7274001258564), X15066 = c(73.9281189702203, 75.5840928668332, 66.6514646344202,68.8647958317706, 72.7273649521276, 75.6106830955297, 72.7254165267378,75.3347842482702, 72.1536696252486, 71.2967373704481), X15067 = c(90.0565896333052,93.3445863078807, 98.4952661960057, 87.8894514946779, 91.4931614915228,93.1203959572311, 98.654603945752, 95.2259254574924, 96.3664904843602,93.9852743416168), X15068 = c(15.7542324150146, 15.6784122315297,15.6288782624786, 15.9458622765899, 17.2273418776524, 18.5604652302982,19.4428915546788, 17.2544134990258, 15.0475726819185, 18.1179820851577), X15069 = c(29.6981818344881, 27.1289769753967, 26.9031154470616,28.8849172821069, 26.8074513278461, 28.3478447560546, 27.8483555799498,26.3989344094282, 26.3802243827978, 26.4227551130041), X15070 = c(38.5909178083619,34.7962016438253, 35.1441591153973, 38.4124846001427, 34.1561098353752,36.0852662695976, 38.171764033153, 35.1323002655678, 36.4946010748945,34.6585789305578)), row.names = c(NA, 10L), class = "data.frame")
Thanks to #Shirin Yavari for providing the solution:
# subset minvec/maxvec by the scaled data's own column names so that each
# column is paired with its own minimum and maximum
MyDeScaledData <- as.data.frame(
  Map(
    descale,
    MyScaledData,
    minvec[colnames(MyScaledData)],
    maxvec[colnames(MyScaledData)]
  )
)
I have a data frame defined as follows:
model_comp
logLik IC Lack of fit Res var
W2.4 -353.2939 716.5878 1.361885e-01 26.80232
baro5 -353.2936 718.5871 NaN 27.04363
LL.5 -353.2940 718.5880 NaN 27.04384
LL.3 -360.3435 728.6871 3.854799e-04 29.99842
W1.3 -360.3842 728.7684 3.707592e-04 30.01948
W1.4 -360.3129 730.6258 7.850947e-05 30.25028
LL.4 -360.3170 730.6340 7.818416e-05 30.25243
The best model fit is the one with the lowest IC (information criteria). I want to use the best fit to do some plotting etc... So I created:
> bestmodel <- noquote(paste0(as.name(rownames(model_comp[which.min(model_comp$IC),])),"()"))
> bestmodel
[1] W2.4()
I want to use W2.4() as a function call into the drc package.
For example this call works when manually specified:
drm(y~x,logDose = 10, fct=W2.4())
I'm trying to use the value in bestmodel instead to do something like:
drm(y~x,logDose = 10,fct = as.formula(paste(bestmodel)))
I've tried all the options given here with no success. I've messed with as.formula(), noquote(), as.name() with no success.
I also tried as.name(paste0(as.name(bestmodel),"()")) where I didn't add on the "()" in the bestmodel definition above. Still no dice.
model_comp <- structure(list(logLik = c(-353.293902612472, -353.293568997018,
-353.294024776211, -360.343530770823, -360.384220907907, -360.312897918459,
-360.317018443052), IC = c(716.587805224944, 718.587137994035,
718.588049552421, 728.687061541646, 728.768441815814, 730.625795836919,
730.634036886105), `Lack of fit` = c(0.136188459104035, NaN,
NaN, 0.000385479884900107, 0.000370759187117765, 7.85094742623572e-05,
7.81841606352332e-05), `Res var` = c(26.8023196097934, 27.0436263934882,
27.0438389102235, 29.9984226526044, 30.0194755526501, 30.2502847248304,
30.2524338881051)), .Names = c("logLik", "IC", "Lack of fit",
"Res var"), row.names = c("W2.4", "baro5", "LL.5", "LL.3", "W1.3",
"W1.4", "LL.4"), class = "data.frame")
Just using noquote() so that the quotes aren't drawn around a string doesn't turn a character value into an executable piece of code. There is a big difference in R between a character value and a symbol or function call. You can't really just replace one with the other.
So let's say you have extracted the character value from the rownames
x <- "W2.4"
This is basically the string version of the function you want. You can get the value of a symbol (in this case the function W2.4 from the drc package) from its string name with get(). So you can call
drm(y~x, logDose = 10, fct = get(x)())
Note the extra parentheses. The get(x) call returns the W2.4 function, and the second set of parentheses calls the function returned by get().
Using the ryegrass dataset that comes with the drc package, we can see that these two lines return the same thing
drm(rootl ~ conc, data = ryegrass, fct = W2.4())
drm(rootl ~ conc, data = ryegrass, fct = get(x)())