Undefined columns data frame error - r
I would like to create a scatter plot of two variables (Disk and Band); for that I am using the function "ggscatter" from the "ggpubr" package. Every time I try to use the ggscatter function I get the following error:
Error in [.data.frame(data, , x) : undefined columns selected
Here is my code
install.packages("ggpubr")
library("ggpubr")
my_data <- All_Data_Summer_17_
head(my_data, 6)
ggscatter(my_data, x = "band", y = "Disk",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Band", ylab = "Disk (cm)")
Output of str(my_data)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 24 obs. of 22 variables:
$ Sample ID : chr "NP-A-1" "NP-A-2" "NP-A-3" "NP-A-4" ...
$ Lat : num 36.6 36.6 36.6 36.6 36.6 ...
$ Lon : num -95 -95 -95 -95 -95 ...
$ Temp : num 29.1 30.5 30.6 30.7 31 ...
$ SpCond : num 0.077 0.081 0.082 0.086 0.088 0.09 0.084 0.09 0.084 0.085 ...
$ Cond : int 83 90 90 95 98 99 93 99 93 96 ...
$ Resist : num 12107 11116 11066 10537 10248 ...
$ TDS : num 0.05 0.053 0.053 0.056 0.057 0.058 0.055 0.058 0.055 0.055 ...
$ Sal : num 0.03 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 ...
$ pH : num 8.87 9.41 9.56 9.77 9.61 9.38 9.89 9.67 9.89 9.85 ...
$ Chl : num 62.1 40.1 3.7 1.4 4.2 5.6 41.5 17.8 4.5 7.7 ...
$ ODO : num 5.69 8.76 8.28 8.35 8.75 ...
$ TSS : num 1.111 0.667 2.556 3.333 0.778 ...
$ TP : num 0 1.03 0.01 -0.02 -0.01 -0.03 0.01 -0.01 -0.03 0.01 ...
$ TN : num 0.2 0.3 1.9 0.3 1.1 0.5 1.6 0.9 0.5 0.7 ...
$ NO3-N : num 0.43 0.18 0.71 0.36 0.25 0.42 0.26 0.17 0.24 0.19 ...
$ NH3-N : num 0.3 0.2 -0.3 -0.1 -0.4 -0.3 -0.3 -0.3 -0.2 -0.1 ...
$ Chloro-a : num 8.23 7.19 15.37 12.6 14.22 ...
$ Disk: num 55.5 68 50 50.5 69 65 65 67.7 70 66 ...
$ band : num 0.000093 0.000096 0.000103 0.000152 0.000088 0.000089 0.000096 0.000097 0.000092 0.000101 ...
$ Green Band : num 0.000163 0.000169 0.000154 0.000276 0.00016 0.00013 0.00015 0.000175 0.000171 0.000163 ...
$ Red Band : num 0.00012 0.000145 0.000126 0.000246 0.000117 0.000095 0.000116 0.00011 0.000108 0.000126 ...
Output of dput(my_data)
dput(my_data)
structure(list(`Sample ID` = c("NP-A-1", "NP-A-2", "NP-A-3",
"NP-A-4", "NP-A-5", "NP-A-6", "NP-A-7", "NP-A-8", "NP-A-9", "NP-A-10",
"NP-A-11", "NP-A-12", "NP-A-13", "NP-A-14", "NP-A-15", "NP-A-16",
"NP-A-17", "NP-B-1", "NP-B-2", "NP-B-3", "NP-B-4", "NP-B-5",
"NP-B-6", "NP-B-7"), Lat = c(36.568738, 36.569005, 36.569258,
36.569554, 36.569585, 36.569382, 36.56928, 36.568647, 36.568809,
36.569124, 36.569425, 36.569331, 36.56919, 36.569071, 36.568888,
36.568633, 36.568869, 36.568651, 36.568932, 36.56946, 36.569893,
36.570058, 36.569811, 36.56988), Lon = c(-94.96671, -94.966703,
-94.966604, -94.966647, -94.96698, -94.966928, -94.966923, -94.967296,
-94.9677, -94.967761, -94.967911, -94.968069, -94.967358, -94.968107,
-94.968018, -94.968049, -94.968293, -94.968723, -94.968833, -94.968396,
-94.968101, -94.967793, -94.967141, -94.96663), Temp = c(29.12,
30.49, 30.6, 30.71, 30.97, 30.83, 30.82, 30.64, 30.42, 31.62,
31.96, 31.16, 31.16, 32.88, 32.03, 31, 32.41, 31.79, 31.93, 32.17,
32.16, 32.55, 32.61, 32.83), SpCond = c(0.077, 0.081, 0.082,
0.086, 0.088, 0.09, 0.084, 0.09, 0.084, 0.085, 0.08, 0.079, 0.083,
0.079, 0.086, 0.094, 0.078, 0.183, 0.183, 0.183, 0.183, 0.183,
0.183, 0.183), Cond = c(83L, 90L, 90L, 95L, 98L, 99L, 93L, 99L,
93L, 96L, 91L, 88L, 93L, 90L, 97L, 105L, 89L, 206L, 207L, 208L,
208L, 209L, 210L, 210L), Resist = c(12107.2, 11115.7, 11066.2,
10537.1, 10247.7, 10051, 10700.4, 10076.5, 10753.3, 10434.4,
11023, 11304, 10741.8, 11058.1, 10270.4, 9536.35, 11269.8, 4845.53,
4834.38, 4815.44, 4814.59, 4787.82, 4770.86, 4755.86), TDS = c(0.05,
0.053, 0.053, 0.056, 0.057, 0.058, 0.055, 0.058, 0.055, 0.055,
0.052, 0.051, 0.054, 0.051, 0.056, 0.061, 0.051, 0.119, 0.119,
0.119, 0.119, 0.119, 0.119, 0.119), Sal = c(0.03, 0.04, 0.04,
0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
0.04, 0.04, 0.03, 0.08, 0.08, 0.08, 0.08, 0.08, 0.08, 0.08),
pH = c(8.87, 9.41, 9.56, 9.77, 9.61, 9.38, 9.89, 9.67, 9.89,
9.85, 9.46, 9.42, 9.75, 9.19, 10.02, 8.83, 9.65, 7.89, 8.14,
8.21, 8.22, 8.4, 8.21, 8.18), Chl = c(62.1, 40.1, 3.7, 1.4,
4.2, 5.6, 41.5, 17.8, 4.5, 7.7, 8.2, 7.7, 120.3, 3.1, 7.8,
3.6, 3.2, 9.8, 7.6, 6, 10, 8.1, 6.3, 4.3), ODO = c(5.69,
8.76, 8.28, 8.35, 8.75, 8.59, 10.1, 10.06, 9.14, 10.32, 9.1,
8.41, 8.03, 9.63, 9.77, 8.91, 10.16, 7.17, 7.31, 7.41, 7.49,
7.75, 6.98, 7.09), TSS = c(1.1111, 0.6667, 2.5556, 3.3333,
0.7778, -27.3333, 2.1111, -0.3333, 1.2222, -32.6667, -0.2222,
2.3333, -0.2222, 1.1111, 1.4444, 2.6667, 0.1111, 6.3333,
7, 5, 5.4444, 6.4444, 3, 2.7778), TP = c(0, 1.03, 0.01, -0.02,
-0.01, -0.03, 0.01, -0.01, -0.03, 0.01, 0.04, -0.01, -0.03,
0, 0.01, 0.03, 0.04, 0.2, -0.01, 0, -0.03, 0.04, 0.01, -0.01
), TN = c(0.2, 0.3, 1.9, 0.3, 1.1, 0.5, 1.6, 0.9, 0.5, 0.7,
0.6, 1, 0.8, 0.1, 0.4, 1.6, 0.6, 0.8, 0.6, 0.5, 0.9, 1.2,
0.3, 0.6), `NO3-N` = c(0.43, 0.18, 0.71, 0.36, 0.25, 0.42,
0.26, 0.17, 0.24, 0.19, 0.17, 0.41, 0.6, 0.23, 0.3, 0.26,
0.22, 0.32, 0.63, 0.36, 0.24, 0.33, 0.55, 0.36), `NH3-N` = c(0.3,
0.2, -0.3, -0.1, -0.4, -0.3, -0.3, -0.3, -0.2, -0.1, 0.1,
-0.2, 0.2, -0.1, -0.3, -0.1, 0.1, -0.5, 0.2, 0.5, -0.3, 0.2,
-0.4, -0.1), `Chloro-a` = c(8.23, 7.19, 15.37, 12.6, 14.22,
4.56, 7.2, 8.61, 6.31, 8.74, 5.59, 10.92, 5.24, 4.26, 5.48,
6.26, 4.75, 11.45, 10.39, 11.79, 9.59, 9.82, 7.97, 7.92),
`Disk` = c(55.5, 68, 50, 50.5, 69, 65, 65, 67.7, 70,
66, 69, 67, 69, 62, 60, 62, 66, 50, 52, 50, 40, 57, 57, 62
), `band` = c(9.3e-05, 9.6e-05, 0.000103, 0.000152,
8.8e-05, 8.9e-05, 9.6e-05, 9.7e-05, 9.2e-05, 0.000101, 0.000102,
9.6e-05, 0.000106, 8.7e-05, 9.1e-05, 0.000126, 0.000107,
0.000139, 0.000139, 0.000135, 0.000174, 0.000144, 0.000137,
0.000134), `Green Band` = c(0.000163, 0.000169, 0.000154,
0.000276, 0.00016, 0.00013, 0.00015, 0.000175, 0.000171,
0.000163, 0.000177, 0.000188, 0.000131, 0.000162, 0.000166,
0.000233, 0.000204, 0.000265, 0.00023, 0.000254, 0.000325,
0.000262, 0.000263, 0.00028), `Red Band` = c(0.00012, 0.000145,
0.000126, 0.000246, 0.000117, 9.5e-05, 0.000116, 0.00011,
0.000108, 0.000126, 0.000128, 0.000133, 9.3e-05, 0.000114,
0.000113, 0.000176, 0.000136, 0.000215, 0.000198, 0.00019,
0.000218, 0.00021, 0.000205, 0.000223)), .Names = c("Sample ID",
"Lat", "Lon", "Temp", "SpCond", "Cond", "Resist", "TDS", "Sal",
"pH", "Chl", "ODO", "TSS", "TP", "TN", "NO3-N", "NH3-N", "Chloro-a",
"Disk", "band", "Green Band", "Red Band"), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -24L), spec = structure(list(
cols = structure(list(`Sample ID` = structure(list(), class = c("collector_character",
"collector")), Lat = structure(list(), class = c("collector_double",
"collector")), Lon = structure(list(), class = c("collector_double",
"collector")), Temp = structure(list(), class = c("collector_double",
"collector")), SpCond = structure(list(), class = c("collector_double",
"collector")), Cond = structure(list(), class = c("collector_integer",
"collector")), Resist = structure(list(), class = c("collector_double",
"collector")), TDS = structure(list(), class = c("collector_double",
"collector")), Sal = structure(list(), class = c("collector_double",
"collector")), pH = structure(list(), class = c("collector_double",
"collector")), Chl = structure(list(), class = c("collector_double",
"collector")), ODO = structure(list(), class = c("collector_double",
"collector")), TSS = structure(list(), class = c("collector_double",
"collector")), TP = structure(list(), class = c("collector_double",
"collector")), TN = structure(list(), class = c("collector_double",
"collector")), `NO3-N` = structure(list(), class = c("collector_double",
"collector")), `NH3-N` = structure(list(), class = c("collector_double",
"collector")), `Chloro-a` = structure(list(), class = c("collector_double",
"collector")), `Disk` = structure(list(), class = c("collector_double",
"collector")), `band` = structure(list(), class = c("collector_double",
"collector")), `Green Band` = structure(list(), class = c("collector_double",
"collector")), `Red Band` = structure(list(), class = c("collector_double",
"collector"))), .Names = c("Sample ID", "Lat", "Lon", "Temp",
"SpCond", "Cond", "Resist", "TDS", "Sal", "pH", "Chl", "ODO",
"TSS", "TP", "TN", "NO3-N", "NH3-N", "Chloro-a", "Disk",
"band", "Green Band", "Red Band")), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"))
Ok, the easy answer is to run the correlation coefficients first, then the CIs.
Perhaps you could report the bug to the ggpubr maintainer.
ggscatter(my_data, x = "band",
y = "Disk",
add = "reg.line",
cor.coef = FALSE,
cor.method = "pearson",
conf.int = TRUE,
xlab = "Band",
ylab = "Disk (cm)")
Related
geom_smooth not working for trendline, too few points?
I am trying to get a trendline for my two sets of averages, in my main graph I will be putting error bars on the points to show the sd's but below is a simplified version: ggplot(sl, aes(x=Stresslevel, y=Final, color=Treatment)) + geom_point() + geom_smooth(method = "lm") In my output I can see in the legend that it is trying to add it, but it is not showing on the graph: enter image description here Here is an image of the data: enter image description here Edit: Here is my data, thank you for the advice for getting it> dput(sl) structure(list(Stresslevel = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 3L, 4L, 5L), .Label = c("0", "1", "2 (30%)", "3 (50%)", "4 (70%)", "5", "Recovered"), class = "factor"), WL = c(0, 15.5, 32.8, 52.9, 69.8, 89.2, 13.5, 30, 50, 70), WLsd = c(5, 6.5, 8.1, 8.8, 10.6, 4.2, 9.8, 5, 5, 5), Final = c(0.0292, 0.0276, 0.0263, 0.0248, 0.0208, 0.0199, 0.0249, 0.0274, 0.0235, 0.0121), Treatment = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("Stressed", "Treated" ), class = "factor"), Finalsd = c(0.0039, 0.0019, 0.0026, 0.0033, 0.002, 0.0021, 0.0028, 0.0049, 0.0048, 0.0026), Dry = c(0.006, 0.008, 0.0107, 0.0139, 0.0138, 0.0174, 0.0047, 0.008, 0.0116, 0.0105), Drysd = c(0.0015, 0.0015, 0.0017, 0.0024, 0.0011, 0.0022, 0.001, 0.0016, 0.0033, 0.0021), Delta = c(0.0231, 0.0196, 0.0155, 0.0109, 0.007, 0.0025, 0.0201, 0.0194, 0.012, 0.0016), Deltasd = c(0.0034, 0.0015, 0.0019, 0.002, 0.0024, 0.001, 0.0025, 0.0043, 0.0035, 0.0013), WC = c(4.07, 2.54, 1.48, 0.81, 0.52, 0.15, 4.44, 2.48, 1.11, 0.16), WCsd = c(1.22, 0.59, 0.26, 0.21, 0.2, 0.08, 1.06, 0.56, 0.45, 0.12), CD = c(1, 1.33, 1.78, 2.31, 2.29, 2.89, 0.78, 1.33, 1.92, 1.75), CDsd = c(0.24, 0.25, 0.28, 0.4, 0.19, 0.37, 0.16, 0.26, 0.54, 0.35)), class = "data.frame", row.names = c(NA, -10L)) Any help would be greatly appreciated.
Your x variable is a factor, meaning it is a categorical variable, so it's not clear how to fit a regression line through that: str(sl) 'data.frame': 10 obs. of 14 variables: $ Stresslevel: Factor w/ 7 levels "0","1","2 (30%)",..: 1 2 3 4 5 6 7 3 4 5 $ WL : num 0 15.5 32.8 52.9 69.8 89.2 13.5 30 50 70 I am not sure if it makes sense to convert your categories to numeric (that is, stress level 0 becomes 1, stress level 1 becomes 2, etc.) and force a line: ggplot(sl, aes(x=Stresslevel, y=Final, color=Treatment)) + geom_point() + geom_smooth(aes(x=as.numeric(Stresslevel)),method = "lm",se=FALSE) I would say it might make sense to connect the points with lines, if it makes sense to look at the progression of your dependent variable from stress level 0 to 5: ggplot(sl, aes(x=Stresslevel, y=Final, color=Treatment)) + geom_point() + geom_line(aes(x=as.numeric(Stresslevel)),linetype="dashed")
Canonical Correlation in R with different matrix dimensions
I'm having difficulties about doing a CC analysis in R. The assignment which I'm doing is from "Applied Multivariate Analysis" by Sharma, exercise 13.7, if you're familiar with it. Basically, I'm asked to conduct a CCA on a set of variables. There are seven X variables, but only five Y variables, thus R complains that the dimensions are not compatible. See the image below for a visual representation of the data called CETNEW. Edited (Changed from image to dput): structure(list(... 1 = c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "Y1", "Y2", "Y3", "Y4", "Y5"), 2 = c(2.72, 1.2, 0.82, 0.92, 1.19, 1, 1.45, 0.68, 0.98, 0.57, 1.07, 0.91), ... 3 = c(1.2, 3.78, 0.7, 1.04, 1.06, 1.32, 1.31, 0.56, 1, 0.79, 1.13, 1.38), ... 4 = c(0.82, 0.7, 1.7, 0.59, 0.83, 1.08, 1.01, 0.65, 0.78, 0.66, 0.93, 0.77), ... 5 = c(0.92, 1.04, 0.59, 3.09, 1.06, 0.93, 1.47, 0.62, 1.26, 0.51, 0.94, 0.85), ... 6 = c(1.19, 1.06, 0.83, 1.06, 2.94, 1.36, 1.66, 0.68, 1.16, 0.77, 1.37, 1.11), ... 7 = c(1, 1.32, 1.08, 0.93, 1.36, 2.94, 1.56, 0.9, 1.23, 0.78, 1.65, 1.31), ... 8 = c(1.45, 1.31, 1.01, 1.47, 1.66, 1.56, 3.11, 1.03, 1.7, 0.81, 1.63, 1.44), ... 9 = c(0.68, 0.56, 0.65, 0.62, 0.68, 0.9, 1.03, 1.71, 0.99, 0.65, 0.86, 0.72), ... 10 = c(0.98, 1, 0.78, 1.26, 1.16, 1.23, 1.7, 0.99, 3.07, 0.61, 1.43, 1.28), ... 11 = c(0.57, 0.79, 0.66, 0.51, 0.77, 0.78, 0.81, 0.65, 0.61, 2.83, 1.04, 0.84), ... 12 = c(1.07, 1.13, 0.93, 0.94, 1.37, 1.65, 1.63, 0.86, 1.43, 1.04, 2.83, 1.6), ... 13 = c(0.91, 1.38, 0.77, 0.85, 1.11, 1.31, 1.44, 0.72, 1.28, 0.84, 1.6, 4.01)), row.names = c(NA, -12L), class = c("tbl_df", "tbl", "data.frame")) What I've Done so Far CETNEW <- CETNEW[,-1] #To remove the non-numeric values Create two variables (criterion and predictor variables) as: CETNEWx <- CETNEW[1:7,] CETNEWy <- CETNEW[8:12,] Then I've been using various packages such as CCA, CCP and candisk. 
From CCA: ccCETNEW <- cc(CETNEWx,CETNEWy) yields the following error message: Error in cov(X, Y, use = "pairwise") : incompatible dimensions The matcor function, also from CCA, yields the following error message: Error in data.frame(..., check.names = FALSE) : arguments imply differing number of rows: 7, 5 Thus, it would seem that it all boils down to the mismatch in dimensions. I've talked to my professor about it, but since he uses SAS, which apparently can handle this problem and solve it, he could not help me. Please, if you're familiar with canonical correlation and have had a similar problem before, any help regarding this topic would be highly appreciated.
If you look at your data, notice the first column is divided into X and Y labels. That suggests to me that your data are transposed. If so, each column is an observation and the X and Y labels indicate various measurements taken on each observation. Canonical correlations are performed on two groups of measurements/variables from a single set of observations. First, here is the transposed data: CETNEW.T <- structure(list(X1 = c(2.72, 1.2, 0.82, 0.92, 1.19, 1, 1.45, 0.68, 0.98, 0.57, 1.07, 0.91), X2 = c(1.2, 3.78, 0.7, 1.04, 1.06, 1.32, 1.31, 0.56, 1, 0.79, 1.13, 1.38), X3 = c(0.82, 0.7, 1.7, 0.59, 0.83, 1.08, 1.01, 0.65, 0.78, 0.66, 0.93, 0.77), X4 = c(0.92, 1.04, 0.59, 3.09, 1.06, 0.93, 1.47, 0.62, 1.26, 0.51, 0.94, 0.85 ), X5 = c(1.19, 1.06, 0.83, 1.06, 2.94, 1.36, 1.66, 0.68, 1.16, 0.77, 1.37, 1.11), X6 = c(1, 1.32, 1.08, 0.93, 1.36, 2.94, 1.56, 0.9, 1.23, 0.78, 1.65, 1.31), X7 = c(1.45, 1.31, 1.01, 1.47, 1.66, 1.56, 3.11, 1.03, 1.7, 0.81, 1.63, 1.44), Y1 = c(0.68, 0.56, 0.65, 0.62, 0.68, 0.9, 1.03, 1.71, 0.99, 0.65, 0.86, 0.72 ), Y2 = c(0.98, 1, 0.78, 1.26, 1.16, 1.23, 1.7, 0.99, 3.07, 0.61, 1.43, 1.28), Y3 = c(0.57, 0.79, 0.66, 0.51, 0.77, 0.78, 0.81, 0.65, 0.61, 2.83, 1.04, 0.84), Y4 = c(1.07, 1.13, 0.93, 0.94, 1.37, 1.65, 1.63, 0.86, 1.43, 1.04, 2.83, 1.6), Y5 = c(0.91, 1.38, 0.77, 0.85, 1.11, 1.31, 1.44, 0.72, 1.28, 0.84, 1.6, 4.01 )), class = "data.frame", row.names = c(NA, -12L)) Now the analysis runs fine: library("CCA") str(CETNEW.T) # 'data.frame': 12 obs. of 12 variables: # $ X1: num 2.72 1.2 0.82 0.92 1.19 1 1.45 0.68 0.98 0.57 ... # $ X2: num 1.2 3.78 0.7 1.04 1.06 1.32 1.31 0.56 1 0.79 ... # $ X3: num 0.82 0.7 1.7 0.59 0.83 1.08 1.01 0.65 0.78 0.66 ... # $ X4: num 0.92 1.04 0.59 3.09 1.06 0.93 1.47 0.62 1.26 0.51 ... # $ X5: num 1.19 1.06 0.83 1.06 2.94 1.36 1.66 0.68 1.16 0.77 ... # $ X6: num 1 1.32 1.08 0.93 1.36 2.94 1.56 0.9 1.23 0.78 ... # $ X7: num 1.45 1.31 1.01 1.47 1.66 1.56 3.11 1.03 1.7 0.81 ... 
# $ Y1: num 0.68 0.56 0.65 0.62 0.68 0.9 1.03 1.71 0.99 0.65 ... # $ Y2: num 0.98 1 0.78 1.26 1.16 1.23 1.7 0.99 3.07 0.61 ... # $ Y3: num 0.57 0.79 0.66 0.51 0.77 0.78 0.81 0.65 0.61 2.83 ... # $ Y4: num 1.07 1.13 0.93 0.94 1.37 1.65 1.63 0.86 1.43 1.04 ... # $ Y5: num 0.91 1.38 0.77 0.85 1.11 1.31 1.44 0.72 1.28 0.84 ... X <- CETNEW.T[, 1:7] Y <- CETNEW.T[, 8:12] ccCETNEW <- cc(X, Y) ccCETNEW is a list with 5 components containing the results.
How to predict with a regression model with many missing values?
I intend to analyze and build a regression model with a dummy variable as a dependent variable. I'm using the glm function, but I can't predict it. I don't want to exclude the missing values. What is the best way to make good predictions in cases where the database has many missing values? n$status <-as.factor(n$status) set.seed(900) training.samples <- n$status %>% createDataPartition(p = 0.8, list = FALSE) train.data <- n[training.samples, ] test.data <- n[-training.samples, ] model=glm(status ~.,data = train.data,family = binomial(link = "logit")) m <- data.frame(x1=mean(n$x1,na.rm=T),x2=mean(n$x2,na.rm=T)) m$predictprob <- predict(model, newdata=m, type="response") Error in eval(predvars, data, env) : object 'x1' not found When I try to make the forecast this error appears. I think it must be because of the missing values. str(n) 'data.frame': 4371 obs. of 8 variables: $ status: Factor w/ 2 levels "Active","Inactive": 1 1 1 1 1 1 1 1 1 1 ... $ x1 : num 12.2 12.4 13.1 10.9 22.7 ... $ x2 : num 4.27 2.17 5.91 5.81 7.44 ... $ x3 : num 8.3 7.71 12.41 9.34 19.57 ... $ x4 : num 2.91 1.34 5.61 4.99 6.43 ... $ x5 : num 4.51 1.83 9.11 10.68 14.23 ... $ x6 : num 3.7 4.94 12.27 11.29 15.13 ... $ x7 : num 2.22 3.4 1.12 0.84 1.11 4.07 8.15 0.79 8.16 8.86 .. dput(train.data[1:10,]) structure(list(Status = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Active", "Inactive"), class = "factor"), x1 = c(12.17, 12.41, 13.07, 10.88, 22.66, 43.54, 64.75, 255.43, 10.05, 1.84), x2 = c(4.27, 2.17, 5.91, 5.81, 7.44, 17.17, 22.51, 9.29, 0.78, 0.42), x3 = c(8.3, 7.71, 12.41, 9.34, 19.57, 33.7, 48.1, 252.75, 6.89, 2.24), x4 = c(2.91, 1.34, 5.61, 4.99, 6.43, 13.29, 16.72, 9.19, 0.53, 0.51), x5 = c(4.51, 1.83, 9.11, 10.68, 14.23, 8.99, 7.94, 19.73, 1.09, 0.2), x6 = c(3.7, 4.94, 12.27, 11.29, 15.13, 9.07, 7.94, 21.21, 0.96, 0.02), x7 = c(2.22, 3.4, 1.12, 0.84, 1.11, 4.07, 8.15, 0.79, 8.16, 8.86), row.names = c(NA, 10L), class = "data.frame")
Divide each column of a dataframe by one row of the dataframe
I would like to divide each column of my dataframe by the corresponding value in one row. I tried to convert my dataframe into a matrix, extract one row of the dataframe as a vector, and then divide the matrix by the vector, but it did not work: only the first row of the matrix got divided by the vector. Here is my original dataframe. And this is the code I tried to run: data <- read_excel("Documents/TFB/xlsx_geochimie/solfatara_maj.xlsx") View(data) data.mat <- as.matrix(data[,2:20]) vector <- data[12,2:20] data.mat/vector
We replicate the vector to make the length same and then do the division data.mat/unlist(vector)[col(data.mat)] # FeO Total S SO4 Total N SiO2 Al2O3 Fe2O3 MnO MgO CaO Na2O K2O #[1,] 0.10 16.5555556 NA NA 0.8908607 0.8987269 0.1835206 0.08333333 0.03680982 0.04175365 0.04823151 0.5738562 #[2,] 0.40 125.8333333 NA NA 0.5510204 0.4456019 0.2359551 0.08333333 0.04294479 0.01878914 0.04501608 0.2588235 #[3,] 0.85 0.6111111 NA NA 1.0021295 1.0162037 0.7715356 1.08333333 0.53987730 0.69728601 1.03858521 1.0457516 #[4,] 0.15 48.0555556 NA NA 1.1027507 0.2569444 NA 0.08333333 0.01840491 0.01878914 0.04180064 0.1647059 #[5,] 0.85 NA NA NA 1.0889086 1.0271991 0.6591760 0.75000000 0.59509202 0.53862213 1.02250804 1.1228758 #[6,] NA NA NA NA 1.3426797 0.6319444 0.0411985 0.08333333 0.03067485 0.11899791 0.65594855 0.7764706 # TiO2 P2O5 LOI LOI2 Total Total 2 Fe2O3(T) #[1,] 0.7924528 0.3928571 7.0841837 6.6963855 0.9922233 0.9894632 0.14489796 #[2,] 0.5094340 0.3214286 14.5561224 13.7710843 0.9958126 0.9936382 0.31020408 #[3,] 0.8679245 0.6428571 1.5637755 1.5228916 0.9990030 0.9970179 0.80612245 #[4,] 1.4905660 0.2857143 7.4056122 7.0024096 0.9795613 0.9769384 0.05510204 #[5,] 1.0377358 0.2500000 0.3520408 0.3783133 0.9969093 0.9960239 0.74489796 #[6,] 0.3018868 0.2500000 1.2551020 1.1879518 1.0019940 1.0000000 0.04489796 Or use sweep sweep(data.mat, MARGIN = 2, unlist(vector), FUN = `/`) Or using mapply with asplit mapply(`/`, asplit(data.mat, 2), vector) data data_mat <- structure(c(0.2, 0.8, 1.7, 0.3, 1.7, NA, 5.96, 45.3, 0.22, 17.3, NA, NA, NA, 6.72, NA, 4.08, 0.06, 0.16, NA, NA, NA, NA, NA, NA, 50.2, 31.05, 56.47, 62.14, 61.36, 75.66, 15.53, 7.7, 17.56, 4.44, 17.75, 10.92, 0.49, 0.63, 2.06, NA, 1.76, 0.11, 0.01, 0.01, 0.13, 0.01, 0.09, 0.01, 0.06, 0.07, 0.88, 0.03, 0.97, 0.05, 0.2, 0.09, 3.34, 0.09, 2.58, 0.57, 0.15, 0.14, 3.23, 0.13, 3.18, 2.04, 4.39, 1.98, 8, 1.26, 8.59, 5.94, 0.42, 0.27, 0.46, 0.79, 0.55, 0.16, 0.11, 0.09, 0.18, 0.08, 0.07, 0.07, 27.77, 57.06, 6.13, 
29.03, 1.38, 4.92, 27.79, 57.15, 6.32, 29.06, 1.57, 4.93, 99.52, 99.88, 100.2, 98.25, 99.99, 100.5, 99.54, 99.96, 100.3, 98.28, 100.2, 100.6, 0.71, 1.52, 3.95, 0.27, 3.65, 0.22), .Dim = c(6L, 19L), .Dimnames = list( NULL, c("FeO", "Total S", "SO4", "Total N", "SiO2", "Al2O3", "Fe2O3", "MnO", "MgO", "CaO", "Na2O", "K2O", "TiO2", "P2O5", "LOI", "LOI2", "Total", "Total 2", "Fe2O3(T)"))) vector <- structure(list(FeO = 2, `Total S` = 0.36, SO4 = NA_real_, `Total N` = NA_real_, SiO2 = 56.35, Al2O3 = 17.28, Fe2O3 = 2.67, MnO = 0.12, MgO = 1.63, CaO = 4.79, Na2O = 3.11, K2O = 7.65, TiO2 = 0.53, P2O5 = 0.28, LOI = 3.92, LOI2 = 4.15, Total = 100.3, `Total 2` = 100.6, `Fe2O3(T)` = 4.9), row.names = c(NA, -1L), class = c("tbl_df", "tbl", "data.frame"))
To divide a data frame, df, by its third row: df/df[rep(3, nrow(df)), ]
How to subset columns based on value in a different column?
EDITED: I have a dataframe that stores information about when particular assessment happened ('when'). This assessment happened at different times (t1 - t3) which vary by participant. The dataframe also contains all the assessments ever completed by every participant (including the one referenced in the 'when' column). I only want the assessment information represented in the 'when' column. So if the number is 1, I want to keep all the data related to that assessment and remove all the data that was not collected at that assessment. Please note that I have many more variables in my actual data set than are represented in this shortened data set so any solution should not rely on repeating variable names. Here's the best I can do. The problem with this solution is that it would have to be repeated for every variable name. df2 <- mutate(.data = df, a1G_when = if_else(when == 1, a1G_t1, NA_real_)) # here is what we start with df <- structure(list(id = 1:10, when = c(1, 3, 2, 1, 2, 1, 3, 2, 3, 1), a1G_t1 = c(0.78, 0.21, 0.04, 0.87, 0.08, 0.25, 0.9, 0.77, 0.51, 0.5), Stqo_t1 = c(0.68, 0.77, 0.09, 0.66, 0.94, 0.05, 0.97, 0.92, 1, 0.04), Twcdz_t1 = c(0.95, 0.41, 0.29, 0.54, 0.06, 0.45, 0.6, 0.24, 0.17, 0.55), Kgh_t1 = c(0.25, 0.86, 0.37, 0.34, 0.97, 0.75, 0.73, 0.68, 0.37, 0.66), `2xWX_t1` = c(0.47, 0.52, 0.23, 0.5, 0.88, 0.71, 0.21, 0.98, 0.76, 0.21), `2IYnS_t1` = c(0.32, 0.75, 0.03, 0.46, 0.89, 0.71, 0.51, 0.83, 0.34, 0.32), a1G_t2 = c(0.97, 0.01, 0.58, 0.33, 0.58, 0.37, 0.76, 0.33, 0.39, 0.56), Stqo_t2 = c(0.78, 0.42, 0.5, 0.69, 0.09, 0.72, 0.84, 0.94, 0.46, 0.83), Twcdz_t2 = c(0.62, 0.34, 0.72, 0.62, 0.8, 0.26, 0.3, 0.88, 0.42, 0.53), Kgh_t2 = c(0.99, 0.66, 0.02, 0.17, 0.51, 0.03, 0.03, 0.74, 0.1, 0.26), `2xWX_t2` = c(0.68, 0.97, 0.56, 0.27, 0.66, 0.71, 0.96, 0.24, 0.37, 0.76), `2IYnS_t2` = c(0.24, 0.88, 0.58, 0.31, 0.8, 0.92, 0.91, 0.9, 0.55, 0.52), a1G_t3 = c(0.73, 0.6, 0.66, 0.06, 0.33, 0.34, 0.09, 0.44, 0.73, 0.56), Stqo_t3 = c(0.28, 0.88, 0.56, 0.75, 0.85, 0.33, 
0.88, 0.4, 0.63, 0.61), Twcdz_t3 = c(0.79, 0.95, 0.41, 0.07, 0.99, 0.06, 0.74, 0.17, 0.89, 0.4), Kgh_t3 = c(0.06, 0.52, 0.35, 0.91, 0.43, 0.74, 0.72, 0.96, 0.39, 0.4), `2xWX_t3` = c(0.25, 0.09, 0.64, 0.32, 0.15, 0.14, 0.18, 0.33, 0.97, 0.6), `2IYnS_t3` = c(0.92, 0.49, 0.09, 0.95, 0.3, 0.83, 0.82, 0.56, 0.29, 0.36)), row.names = c(NA, -10L), class = "data.frame") # here is an example of what I want with the first column. I would also want all other repeating columns to look like this (Stq0_when, Twcdz, etc.) id when a1G_when 1 1 1 0.78 2 2 3 0.88 3 3 2 0.58 4 4 1 0.87 5 5 2 0.58 6 6 1 0.25 7 7 3 0.09 8 8 2 0.33 9 9 3 0.73 10 10 1 0.50
Using data.table, you could do something like: library(data.table) cols <- unique(paste0(gsub("_.*", "", setdiff(names(df), c("id", "when"))), "_when")) setDT(df)[ , (cols) := lapply(cols, function(x) paste0(gsub("_.*", "", x), "_t", when))][ , (cols) := lapply(cols, function(x) as.character(.SD[[get(x)]])), by = cols][ , (cols) := lapply(.SD, as.numeric), .SDcols = cols ] Output (only first 10 rows and only relevant when columns): a1G_when Stqo_when Twcdz_when Kgh_when 2xWX_when 2IYnS_when 1: 0.78 0.68 0.95 0.25 0.47 0.32 2: 0.60 0.88 0.95 0.52 0.09 0.49 3: 0.58 0.50 0.72 0.02 0.56 0.58 4: 0.87 0.66 0.54 0.34 0.50 0.46 5: 0.58 0.09 0.80 0.51 0.66 0.80 6: 0.25 0.05 0.45 0.75 0.71 0.71 7: 0.09 0.88 0.74 0.72 0.18 0.82 8: 0.33 0.94 0.88 0.74 0.24 0.90 9: 0.73 0.63 0.89 0.39 0.97 0.29 10: 0.50 0.04 0.55 0.66 0.21 0.32
Here is an opportunity to use the new tidyr::pivot_longer. We can use this to reshape the data so that var and t are in their own columns, filter to just the rows with the data we want (i.e. where t equals when) and then pivot the data back out to wide. library(tidyverse) df1 <- structure(list(ID = c(101, 102, 103, 104, 105), when = c(1, 2, 3, 1, 2), var1_t1 = c(5, 6, 4, 5, 6), var2_t1 = c(2, 3, 4, 2, 3), var1_t2 = c(7, 8, 9, 7, 8), var2_t2 = c(5, 4, 5, 4, 5), var1_t3 = c(3, 4, 3, 4, 3), var2_t3 = c(6, 7, 6, 7, 6)), row.names = c(NA, 5L), class = "data.frame") df1 %>% pivot_longer( cols = starts_with("var"), names_to = c("var", "t"), names_sep = "_t", values_to = "val", col_ptypes = list(var = character(), t = numeric()) ) %>% filter(when == t) %>% select(-t) %>% pivot_wider(names_from = "var", values_from = "val") #> # A tibble: 5 x 4 #> ID when var1 var2 #> <dbl> <dbl> <dbl> <dbl> #> 1 101 1 5 2 #> 2 102 2 8 4 #> 3 103 3 3 6 #> 4 104 1 5 2 #> 5 105 2 8 5 Created on 2019-07-16 by the reprex package (v0.3.0)