I want to perform log-normalisation on my data. Since some entries in my data frame are 0.0000, I want to replace them with a very small value of the order of 1e-7, so that I don't get -Inf stored after taking the log.
I'm writing the following code in my console:
# Replace every exact-zero entry of genes_rpkm_rep_colN with 1e-7, one cell
# at a time, so that a later log() does not produce -Inf.
for(i in 1:nrow(genes_rpkm_rep_colN))
{
for(j in 1:ncol(genes_rpkm_rep_colN))
{
# NOTE(review): if() needs a single non-NA TRUE/FALSE. When a cell is NA,
# `NA == 0` evaluates to NA and if() stops with "missing value where
# TRUE/FALSE needed" -- check the data frame for NA entries.
if(genes_rpkm_rep_colN[i,j] == 0.0000000){
genes_rpkm_rep_colN[i,j] <- 1e-7
}
}
}
I'm encountering the following error while running this piece of code:
Error in if (genes_rpkm_rep_colN[i, j] == 0) { :
missing value where TRUE/FALSE needed
I've put a TRUE/FALSE condition in the if() statement, yet I still get the error.
I'm sharing a small piece of my data below so that you can have a look and check that my data isn't what is causing the error.
> dput(genes_rpkm_rep_colN[1:10,1:30])
structure(list(X42MGBA_CENTRAL_NERVOUS_SYSTEM = c(0.0093774,
3.99494, 0.0208305, 0.0065619, 0.0084466, 0.0085095, 0.0174268,
0.0233318, 0.0530461, 0.0699613), X8MGBA_CENTRAL_NERVOUS_SYSTEM = c(0,
4.6815, 0.0188461, 0.0118735, 0.0152838, 0.0230965, 0.0157667,
0.0070364, 0.0319951, 0.101274), A1207_CENTRAL_NERVOUS_SYSTEM = c(0.0432576,
2.96619, 0.0137272, 0.0259454, 0, 0.0336463, 0.0114842, 0, 0.0553488,
7.44429), A172_CENTRAL_NERVOUS_SYSTEM = c(0.0194699, 2.92748,
0.0216248, 0.0272483, 0, 0.0176679, 0.0180913, 0.0080738, 0.0665414,
0.0387354), AM38_CENTRAL_NERVOUS_SYSTEM = c(0.0115334, 2.69758,
0.0085399, 0.0322822, 0.0069257, 0, 0.0357226, 0.0063769, 0.0471195,
0.271525), CAS1_CENTRAL_NERVOUS_SYSTEM = c(0.10065, 4.8228, 0.0958194,
0.0469533, 0.0518052, 0.069588, 0.0979765, 0.0556501, 0.117486,
0.147798), CCFSTTG1_CENTRAL_NERVOUS_SYSTEM = c(0.0440228, 6.04641,
0.019558, 0.0246441, 0.0158612, 0.0079897, 0.0163623, 0.0073022,
0.0601819, 0.118238), CH157MN_CENTRAL_NERVOUS_SYSTEM = c(0.0120244,
3.41429, 0.0053421, 0.0235595, 0.0173293, 0.0043646, 0.0044692,
0.0139616, 0.0408118, 0.181811), D283MED_CENTRAL_NERVOUS_SYSTEM = c(0.0638066,
5.12254, 0.0250124, 0.057781, 0.0135231, 0.0272476, 0.0279006,
0.0124515, 0.0583877, 0.343494), D341MED_CENTRAL_NERVOUS_SYSTEM = c(0.0418829,
4.97037, 0.0348888, 0.0219808, 0.0377255, 0.0380065, 0.058376,
0.0217101, 0.0937822, 1.3228), DAOY_CENTRAL_NERVOUS_SYSTEM = c(0.0277923,
4.16543, 0.051447, 0.0194477, 0.016689, 0.0336267, 0.0602569,
0.0460997, 0.0633229, 0.317934), DBTRG05MG_CENTRAL_NERVOUS_SYSTEM = c(0.062215,
4.22423, 0.0307115, 0.0580469, 0.0622661, 0.012546, 0.0128466,
0.0171996, 0.72017, 0.192542), DKMG_CENTRAL_NERVOUS_SYSTEM = c(0.0061458,
2.58862, 0.0546082, 0.0086011, 0.0332147, 0.0446161, 0.0571067,
0.0866511, 0.0985031, 0.128385), GAMG_CENTRAL_NERVOUS_SYSTEM = c(0.0638691,
4.18606, 0.023646, 0.0595902, 0.0095882, 0.0676175, 0.0296734,
0.0264853, 0.0953419, 1.13302), GB1_CENTRAL_NERVOUS_SYSTEM = c(0.0332071,
4.09682, 0.0122941, 0.0232368, 0.0199406, 0.0100446, 0.0205706,
0.036721, 0.15393, 8.77573), GI1_CENTRAL_NERVOUS_SYSTEM = c(0.0236971,
2.99664, 0.0315838, 0.0132657, 0.008538, 0.0344062, 0.0528461,
0.0196535, 0.0826642, 0.132007), GMS10_CENTRAL_NERVOUS_SYSTEM = c(0.112392,
3.29799, 0, 0.0058257, 0.007499, 0.0151096, 0.0232076, 0.0069047,
0.0392457, 0.0786757), GOS3_CENTRAL_NERVOUS_SYSTEM = c(0.0785394,
3.06583, 0.0793018, 0.0349735, 0.0128625, 0.0194374, 0.0464408,
0.0207256, 0.149777, 0.205972), H4_CENTRAL_NERVOUS_SYSTEM = c(0.0412065,
5.11983, 0.0416065, 0.0209705, 0.0337421, 0.0543895, 0.0417697,
0.018641, 0.0953581, 0.432261), HS683_CENTRAL_NERVOUS_SYSTEM = c(0.0395662,
4.82034, 0.0087891, 0.016612, 0.0285111, 0, 0.0294118, 0.0164074,
0.0708759, 0.240087), IOMMLEE_CENTRAL_NERVOUS_SYSTEM = c(0.0089568,
3.07764, 0, 0.0188027, 0.0080677, 0.0406391, 0.0083226, 0.0037142,
0.0295557, 0.178196), KALS1_CENTRAL_NERVOUS_SYSTEM = c(0.0212606,
3.22541, 0.0094454, 0.0059509, 0.0076601, 0.0154343, 0.0790207,
0.0105796, 0.0440979, 0.135353), KG1C_CENTRAL_NERVOUS_SYSTEM = c(0.0306739,
3.25635, 0.0292018, 0.0674589, 0.007894, 0.0397642, 0.0814343,
0.0036343, 0.107415, 0.248463), KNS42_CENTRAL_NERVOUS_SYSTEM = c(0.0377038,
2.77745, 0.0598239, 0.0075381, 0.0097032, 0, 0, 0.0044672, 0.0660162,
0.128592), KNS60_CENTRAL_NERVOUS_SYSTEM = c(0.0308664, 2.75686,
0.0571377, 0.0359982, 0, 0.0186731, 0.0095603, 0, 0.0606269,
0.214931), KNS81_CENTRAL_NERVOUS_SYSTEM = c(0.0376095, 4.39526,
0.041772, 0.0328967, 0.0169382, 0.0341286, 0.0349465, 0.003899,
0.0864295, 0.0841772), KS1_CENTRAL_NERVOUS_SYSTEM = c(0.0113846,
1.91478, 0.0252892, 0.0318656, 0.0102545, 0.0413236, 0.0317354,
0.004721, 0.0295168, 0.18686), LN18_CENTRAL_NERVOUS_SYSTEM = c(0.0159147,
4.40237, 0, 0.0371213, 0.0191134, 0.0192557, 0.0197172, 0.0219985,
0.0600177, 0.358841), LN215_CENTRAL_NERVOUS_SYSTEM = c(0.0188976,
6.19285, 0.0209891, 0, 0, 0.0257228, 0.0175595, 0.0274276, 0.05345,
0.422964), LN229_CENTRAL_NERVOUS_SYSTEM = c(0.0042589, 4.66724,
0.0189209, 0.0059603, 0.0153445, 0, 0.0316585, 0.0070643, 0.0602291,
0.169461)), row.names = c("DDX11L1", "WASH7P", "MIR1302-11",
"FAM138A", "OR4G4P", "OR4G11P", "OR4F5", "RP11-34P13.7", "CICP27",
"AL627309.1"), class = "data.frame")
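The error means that at least one cell is NA: `NA == 0` evaluates to NA, which if() cannot treat as TRUE or FALSE. Maybe try this without a loop (ifelse() simply leaves NA cells as NA):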
library(dplyr)
# Step 1: swap exact zeros for a tiny positive value in every column.
# Step 2: append a natural-log copy of each column, named "log_<column>".
df |>
  mutate(across(everything(), \(v) ifelse(v == 0.0000000, 1e-7, v))) |>
  mutate(across(everything(), \(v) log(v), .names = "log_{col}"))
Related
I'm trying to predict the gender for some samples that have missing values. I'm doing that using the gene expression.
So first I train the logistic regression model using the samples that don't have missing values (that have the value male or female). This is the data I'm using (called mydata) to train the model, and here in the Gender feature, the 1 are males and 0 are females:
structure(list(CA5B = c(30.8594477594147, 30.8773853294407, 31.5109543268185,
29.852812443292, 31.9303544611987, 32.1541109784662, 32.6520127984013,
32.9726252284503, 31.4152036112846, 32.6206677736732), DDX3X = c(35.25792,
35.17134, 36.28966, 36.08013, 36.2734, 35.60448, 36.01073, 36.28618,
35.42917, 35.85764), EIF1AX = c(32.12871, 31.99721, 33.5218,
34.90091, 33.33981, 33.07818, 32.95223, 34.47241, 31.50087, 32.53821
), VAX2 = c(26.0371, 23.2217, 19.53356, 23.92908, 22.51166, 22.45692,
23.62209, 19.53356, 19.53356, 19.53356), KLRC1 = c(30.35354,
28.63985, 25.67501, 26.18108, 30.0377, 29.63008, 25.20041, 28.79883,
30.04889, 31.12243), KLRC2 = c(30.69315, 29.72534, 23.88161,
28.60153, 30.28375, 28.74612, 24.03185, 25.71121, 28.1028, 30.75633
), ARSD = c(31.6010966942421, 31.2081406187661, 32.525989520392,
33.4006989772133, 31.8554455039159, 32.3438989185126, 32.103684088194,
32.2785447752453, 32.028984695614, 31.5829276898759), DDX43 = c(29.90975,
28.0152, 26.15494, 25.70774, 26.4806, 27.44477, 30.52285, 31.97889,
31.50345, 26.90941), RPS4Y1 = c(35.94301, 36.79795, 38.03506,
26.53381, 29.87951, 37.13222, 35.91265, 26.53172, 35.37051, 37.71164
), TRAPPC2 = c(31.73251, 32.12647, 32.91964, 33.16043, 32.28315,
33.24194, 31.20461, 31.56589, 32.482, 34.21314), SNCG = c(28.78017,
33.80945, 31.28264, 35.49992, 31.63203, 29.34577, 29.78785, 30.73165,
29.9412, 26.04425), KDM6A = c(34.19294, 34.71109, 33.94433, 34.64027,
34.93768, 34.25181, 34.2198, 34.88605, 33.38825, 34.8068), ZFX = c(33.84244,
34.04817, 33.83408, 34.90102, 34.77175, 33.54326, 34.39611, 34.50292,
33.27768, 33.87074), PNPLA4 = c(31.15101, 31.32295, 33.38545,
34.34879, 30.98438, 32.77684, 31.26002, 32.36503, 31.15222, 32.12835
), KDM5C = c(33.6612, 34.3589, 33.50819, 34.56994, 34.46354,
33.27832, 34.10299, 34.48084, 34.4775, 34.5186), SMC1A = c(34.18368,
33.39101, 34.2632, 34.28327, 34.15166, 33.94223, 34.71688, 34.61705,
33.99106, 33.76364), DDX3Y = c(34.14224, 34.8835, 34.7245, 26.66744,
29.06797, 34.71189, 33.96947, 26.66531, 34.68055, 34.48187),
SYAP1 = c(32.03834, 32.42337, 32.51431, 33.51916, 32.82407,
32.4735, 32.49154, 33.51064, 31.29551, 31.83166), Gender = c(1,
1, 1, 0, 0, 1, 1, 1, 1, 1)), row.names = c("EA595454", "EA595500",
"EA595522", "EA595529", "EA595597", "EA595624", "EA595632", "EA595635",
"EA595647", "EA595654"), class = "data.frame")
Code:
# Draw a train/test indicator with an 80% training ratio.
# NOTE(review): assuming this is caTools::sample.split(), it is documented to
# take a vector of labels; here the whole data frame is passed -- presumably
# mydata$Gender was intended. TODO confirm.
split = sample.split(mydata, SplitRatio = 0.8)
# Comparing against the strings "TRUE"/"FALSE" works because the logical
# indicator is coerced to character for the comparison.
train_reg = subset(mydata, split == "TRUE")
test_reg = subset(mydata, split == "FALSE")
# Logistic regression of Gender on all remaining columns of the training set.
logistic_model = glm(Gender~., data = train_reg, family = binomial)
# type = "response" asks predict() for fitted probabilities instead of the
# default link-scale (log-odds) values.
predict_reg = predict(logistic_model, test_reg, type = "response")
# Turn probabilities into class labels with a 0.5 cut-off.
predict_reg = ifelse(predict_reg >0.5, 1, 0)
This produced AUC of 0.75 (on the test set). Not bad.
Then I take only the samples with the missing values of gender, and predict if they are male or female using the model.
pred = predict(logistic_model,mydata_NA_samples)
This is some of the results I get:
Pt1 Pt10 Pt101 Pt103 Pt106 Pt11 Pt17 Pt18
1548291146811975 -443770882316732 100625892356271 420508521495519 1756507132742650 -883868739619674 -262910227380331 2442533193074350
Pt2 Pt24 Pt26 Pt27 Pt28 Pt29 Pt3 Pt30
569411355627798 1699537030844227 -703783585812457 3495433064250008 -399805416449645 -339035064434972 2024260475793067 109885153661113
Pt31 Pt34 Pt36 Pt37 Pt38 Pt39 Pt4 Pt44
-367070086585505 1330361581729001 1740587250736183 3489930082447853 -976790159879838 1751865170092986 -283113980482947 1902539723154004
Pt46 Pt47 Pt48 Pt49 Pt5 Pt52 Pt59 Pt62
1412716353779596 1108256151592894 1074657527777400 -113959545517722 109187189819909 -57895108035064 792635620314 255566834903770
Pt65 Pt66 Pt67 Pt72 Pt77 Pt78 Pt79 Pt8
-46167159563698 -346701109064255 51185327645114 -795349064523229 244860086302444 4635500642717655 926236606202554 645399266579567
Pt82 Pt84 Pt85 Pt89 Pt9 Pt90 Pt92 Pt94
-651113408988261 -641572344400162 -594901636707441 1514453985992888 -227744411687312 166300730517187 2842003327373200 2502780813663413
I mean, what is this? I'm supposed to get 0 or 1, and maybe some very small number that is close to 0, but this is very strange. I should mention that mydata and mydata_NA_samples have the exact same features, but of course just different samples. How could this happen in logistic regression, which in the first place should only return a binary result?
Thanks!
You forgot to add type="response" in the second predict call.
I have a data.frame called sites_sp where I'm trying to run some functions based on if and else statements. sites_sp has the following structure:
structure(list(x = c(-50.1298257841559, -49.9523708108406, -49.8600298829818,
-49.8590735594872, -49.8600022102151, -49.680556540172), y = c(-29.2498490060132,
-29.1594734717135, -29.0700140387022, -28.9795033961473, -28.8900003372153,
-28.8945716273705), ua = c("ua_1", "ua_4", "ua_10", "ua_15",
"ua_21", "ua_23"), occ = c(0, 0, 0, 0, 0, 0), PC1 = c(0.403336553595704,
-0.209623013249306, -2.38969068562858, -1.0875631345167, 0.0424075103800285,
-1.69180948954307), PC2 = c(-3.62346919232857, -4.03856503375702,
-1.46862258765078, -1.77908267718137, -2.0250031837701, -0.952927464794925
), PC3 = c(-0.375601733371977, -0.122982261539736, -0.365818414058142,
-0.111150398019996, 0.287459840686463, 0.034973266100254), PC4 = c(-1.31153262462204,
-0.899941801783298, -1.35652371929479, -1.98693913441246, -1.75393016363327,
-0.788097574287776), PC5 = c(1.42830395246321, 1.55155187773266,
1.33933059031444, 0.0760013457702872, 0.588191290690648, -0.408003273953271
)), row.names = c(NA, 6L), class = "data.frame")
What I'm doing is an if and else statements of form:
for(s in sp){
if(sum(sites_sp$occ >= 30)){
pa_data <- st_as_sf(sites_sp,
coords = c("x", "y"),
crs = crs(env_terra))
...
} else {
block of functions for the statement being FALSE
}
}
RELEVANT EDIT: From what I can tell, the function is going directly to the else block even though it should not — since sum(sites_sp$occ) is bigger than 30 for the first s in sp
I can't really understand what's going on. If I try sum(sites_sp$occ) it returns for me a value of 37, implying that the function inside the if block (pa_data <- st_as_sf()...) should run normally. What am I doing wrong here? If more information is needed, please tell me.
Ok, guys...I'm kinda dumb.
The problem is simply here:
if(sum(sites_sp$occ >= 30)){
Should be written as
if(sum(sites_sp$occ) >= 30){
My condition was inside the sum
I have a for-loop it looks like that:
for (ID in rownames(countDF)) {
avector <- as.vector(as.numeric(countDF2[rownames(countDF2)==ID,]))
nbfit <- fitdistr(avector,'negative binomial')
}
So I want to calculate the fitdistr function for each of IDs. But the problem is that for some of the IDs the function doesn't work and throws an error. Here it is:
Error in stats::optim(x = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, :
non-finite finite-difference value [2]
I want to skip these IDs somehow and continue with the others.
I've found the function try(), but I don't understand how it works.
I've tried it like this:
nbfir <- try(fitdistr(avector,'negative binomial'))
But the loop still breaks down with the error.
What should I do to fix it ?
You could use tryCatch and do nothing on catching an error.
# Fit a negative binomial distribution to each ID's counts, skipping IDs
# whose fit fails instead of aborting the loop.
# All successful fits are kept in `nbfits` (one named entry per ID, NULL where
# the fit failed), so later iterations no longer overwrite earlier results.
# `nbfit` still holds the most recent successful fit, as before.
ids <- rownames(countDF)
nbfits <- vector("list", length(ids))
names(nbfits) <- ids
for (ID in ids) {
  # NOTE(review): IDs are taken from countDF but the rows are read from
  # countDF2, exactly as in the question -- confirm this is intended.
  avector <- as.vector(as.numeric(countDF2[rownames(countDF2) == ID, ]))
  fit <- tryCatch(
    fitdistr(avector, "negative binomial"),
    error = function(e) {
      # Report the failure instead of swallowing it silently.
      message("Skipping ", ID, ": ", conditionMessage(e))
      NULL
    }
  )
  # Guarded: assigning NULL with [[<- would delete the list element.
  if (!is.null(fit)) {
    nbfits[[ID]] <- fit
    nbfit <- fit
  }
}
I created the following function to determine the lag of two variables.
However, this function takes only two parameters, and I would like to run it over my whole dataset:
datSel <- structure(list(stat.resProp.Dwell.4 = c(0.000887705, 0.007954085,
-0.025859667, 0.024097552, 0.114052787, 0.023329207, 0.042143181,
-0.092587287, -0.004050228, -0.001624696, 0.020121403, -0.100502922,
0.057354185, 0.025463388, 0.037409854, 0.001561281, -0.028482938,
-0.004827041, 0.014411779, -0.029034298, 0.021053409, -0.067963182,
0.032070259, -0.038091783, 0.039751534, 0.027802281, -0.027802281,
-0.013355791, 0.009201236, -0.073403679, 0.021277398, -0.033901552,
0.012624153, -0.065733979, 0.032017801, -0.072042665, 0.041936911,
0.002861232, 0.017933468, -0.01698154, 0.006638242, -0.08375153,
-0.007220248, 0.0255507, 0.019980685, 0.013752673, 0.026000502,
-0.021134312, -0.019608471, 0.0166916, -0.021654389, 0.066402455,
0.024828862, -0.083302632, 0.042518482, -0.052439198, 0.037186281,
-0.056311172, -0.012270093), stat.lohn = c(0, -0.007558004, -0.015289567,
0, 0, -0.009609384, -0.019500305, 0, 0, -0.012458015, -0.025391532,
-0.000983501, 0, -0.00165265, -0.003313516, 0.000204576, 0, -0.004898564,
-0.009869709, 0, 0, -0.010574012, -0.021489482, 0, 0, -0.011534651,
-0.023476287, 0, 0, -0.00814845, -0.016498838, 0, 0, -0.0099856,
-0.020275409, -0.002818337, 0, -0.007212389, -0.014582736, 0,
0, -0.004121565, -0.008294445, 0, 0, -0.010766386, -0.021886884,
0, 0, -0.010179741, -0.02067574, 0, 0, -0.011797067, -0.024020039,
-0.002017983, -0.007343864, -0.007398196, -0.014962644), stat.resProp.Dwell.1 = c(0.012777325,
-0.002991775, -0.057819571, -0.00796817, -0.019386714, 0, 0.009740337,
0.005638356, -0.035148694, 0, 0.027084134, -0.160377856, 0.101169235,
-0.043007944, 0.043007944, -0.002580647, -0.015625318, 0.023347364,
0.007662873, -0.09607383, -0.024575906, 0.056733018, -0.000904568,
-0.058703392, 0.011450507, 0.007561473, 0.037879817, -0.032246,
0.042169401, -0.001796946, -0.024580209, -0.148788737, 0.082097362,
-0.000985707, -0.00098668, 0.003940892, -0.049380309, 0.005151995,
0.027371197, -0.025317808, 0.019299736, -0.047382704, -0.010604553,
0.082827084, -0.04516573, 0.003075348, 0.007139245, 0.022111454,
-0.004982571, -0.038701368, 0.018519048, -0.049096021, 0.061254226,
-0.020346582, 0.023363175, -0.00402415, -0.014213437, 0.023245109,
0.027587957), stat.carReg = c(0.022775414, 0.008073857, 0.002624717,
0.169431097, -0.144595366, 0.066716837, -0.086971929, 0.037928208,
0.071752161, -0.046824102, 0.106085873, 0.049965928, -0.057984255,
-0.091650262, 0.090732857, -0.082282389, 0.053376121, -0.044203971,
-0.022855425, 0.025856271, 0.000136493, 0.05579193, -0.293966656,
0.013645739, 0.059732986, 0.187020956, -0.145234848, 0.11041385,
-0.126539687, -0.000949877, 0.031473389, 0.020267816, -0.02180532,
-0.07175183, 0.147500145, -0.040559138, 0.008394819, 0.049045337,
-0.043050615, 0.094358754, -0.058408438, -0.005018402, -0.061717889,
0.100150837, -0.071100417, -0.084393865, 0.002854733, 0.002141389,
-0.026538398, 0.013480513, -0.046002189, -0.030495611, 0.052899746,
0.012842017, 0.064086498, 0.020757573, -0.043441298, -0.009563043,
0.048033848)), .Names = c("stat.resProp.Dwell.4", "stat.lohn",
"stat.resProp.Dwell.1", "stat.carReg"), row.names = c(NA, -59L
), class = "data.frame")
The function and my function call is:
#' Compare lag orders for a regression of y on its own lags and lags of x
#'
#' For every lag order i in 1..max.lag, fits
#' `lm(y[t] ~ y.lag[, 1:i] + x.lag[, 1:i])` and compares the fits by AIC,
#' BIC and F-tests between neighbouring lag orders.
#'
#' @param x Series whose lags are used as additional regressors; passed to
#'   `embed()` as is.
#' @param y Response series; coerced with `as.numeric()`.
#' @param max.lag Largest lag order to consider (a single whole number >= 1).
#' @return A list with
#'   `ic`: a max.lag x 2 matrix with columns `aic` and `bic`;
#'   `pvals`: F-test p-values comparing order i with order i - 1 for
#'   i = max.lag down to 2 (empty when max.lag is 1);
#'   `selection`: a list with the lag orders chosen by `aic`, `bic` and
#'   `ftest` (`ftest` is 1 when no comparison is significant at 0.05).
select.lags <- function(x, y, max.lag = 8) {
  stopifnot(
    is.numeric(max.lag), length(max.lag) == 1L, !is.na(max.lag),
    max.lag >= 1, max.lag == round(max.lag)
  )
  y <- as.numeric(y)
  # After dropping the first embed() column (lag 0), column j holds the
  # series lagged by j steps.
  y.lag <- embed(y, max.lag + 1)[, -1, drop = FALSE]
  x.lag <- embed(x, max.lag + 1)[, -1, drop = FALSE]
  # Positions of y that have a full set of max.lag predecessors.
  rows <- tail(seq_along(y), nrow(y.lag))
  fits <- lapply(seq_len(max.lag), function(i) {
    lm(y[rows] ~ y.lag[, 1:i] + x.lag[, 1:i])
  })
  # With a single lag order there is nothing to compare; previously
  # `max.lag:2` counted 1, 2 and indexed fits[[0]], which is an error.
  pvals <- if (max.lag >= 2) {
    vapply(
      max.lag:2,
      function(i) anova(fits[[i]], fits[[i - 1]])[2, "Pr(>F)"],
      numeric(1)
    )
  } else {
    numeric(0)
  }
  # pvals runs from the largest order downwards, so the first significant
  # entry corresponds to the largest order that improves on its predecessor.
  first_sig <- which(pvals < 0.05)[1]
  ftest <- if (is.na(first_sig)) 1 else max.lag - first_sig + 1
  aic <- vapply(fits, AIC, numeric(1))
  bic <- vapply(fits, BIC, numeric(1))
  list(
    ic = cbind(aic = aic, bic = bic),
    pvals = pvals,
    selection = list(aic = which.min(aic), bic = which.min(bic), ftest = ftest)
  )
}
for (i in length(datSel) ) {
for (y in length(datSel) ) {
d1<-ts(datSel[i])
d2<-ts(datSel[y])
lag <- select.lags(d1,d2,5)
}
}
As output of lag I get:
> lag
$ic
aic bic
[1,] -115.3623 -109.56679
[2,] -114.3370 -106.60972
[3,] -116.2026 -106.54350
[4,] -114.7030 -103.11210
[5,] -112.7153 -99.19253
[6,] -110.8018 -95.34721
[7,] -110.0812 -92.69477
[8,] -110.1427 -90.82446
$pvals
[1] 0.1952302 0.3017934 0.7858944 0.9176337 0.5040079 0.0604511 0.3406657
$selection
$selection$aic
[1] 3
$selection$bic
[1] 1
$selection$ftest
[1] 1
As you can see I get only 8 results back, however, my data.frame has 20 variables.
Any recommendation what I am doing wrong?
I appreciate your replies!
If you want to e.g. store the result of the AIC criterion:
# For every ordered pair of variables, store the lag order that
# select.lags() picks by AIC. Rows index the variable passed as x, columns
# the variable passed as y.
# The matrix is sized from the data (not a fixed 4 x 4) and the loop writes
# to the same object that was allocated.
n_vars <- length(datSel)
lag.aic.store <- matrix(
  NA_integer_, n_vars, n_vars,
  dimnames = list(names(datSel), names(datSel))
)
for (i in seq_len(n_vars)) {
  for (y in seq_len(n_vars)) {
    d1 <- ts(datSel[, i])
    d2 <- ts(datSel[, y])
    lag <- select.lags(d1, d2, 5)
    lag.aic.store[i, y] <- lag$selection$aic
  }
}
You get 8 values in $ic because max.lag is 8, it has nothing to do with your number of variables.
Please also note that I added commas when indexing by variable, for clarity, and that you have to loop through 1:length(datSel); otherwise you will only process the last variable.
I used periodogram() in R and got the following warning message:
Warning message: object#samp.rate * seq(1, width/2) : NAs produced by
integer overflow
Here is the code I executed. And I'm using the tuneR package.
waveform <- readWave(test.wav)
maxFreq <- sampleRate/2
minFreq <- 0
periodogram(waveform, width = 131072, overlap = 0, starts = NULL, ends = NULL, taper = 0, normalize = TRUE, frqRange = c(minFreq, maxFreq))
How do I resolve this?