Related
I want to perform log-normalisation on my data, and since some entries in my dataframe are 0.0000, I want to replace them with a very small value of the order of 1e-7, so that after performing log-normalisation I don't get -Inf as the stored value.
I'm writing the following code in my console:
# Replace every cell that is exactly 0 with 1e-7, visiting the data frame
# one cell at a time (rows in the outer loop, columns in the inner loop).
for(i in 1:nrow(genes_rpkm_rep_colN))
{
for(j in 1:ncol(genes_rpkm_rep_colN))
{
# NOTE(review): `if` needs a single TRUE/FALSE. If this cell is NA, the
# comparison is NA as well -- presumably what raises the error reported below.
if(genes_rpkm_rep_colN[i,j] == 0.0000000){
genes_rpkm_rep_colN[i,j] <- 1e-7
}
}
}
I'm encountering the following error while running this piece of code:
Error in if (genes_rpkm_rep_colN[i, j] == 0) { :
missing value where TRUE/FALSE needed
I've put a true/false boolean condition in the if() statement, yet I still get the error.
I'm sharing a small piece of my data below so that you can have a look and check whether my data is what is causing the error.
> dput(genes_rpkm_rep_colN[1:10,1:30])
structure(list(X42MGBA_CENTRAL_NERVOUS_SYSTEM = c(0.0093774,
3.99494, 0.0208305, 0.0065619, 0.0084466, 0.0085095, 0.0174268,
0.0233318, 0.0530461, 0.0699613), X8MGBA_CENTRAL_NERVOUS_SYSTEM = c(0,
4.6815, 0.0188461, 0.0118735, 0.0152838, 0.0230965, 0.0157667,
0.0070364, 0.0319951, 0.101274), A1207_CENTRAL_NERVOUS_SYSTEM = c(0.0432576,
2.96619, 0.0137272, 0.0259454, 0, 0.0336463, 0.0114842, 0, 0.0553488,
7.44429), A172_CENTRAL_NERVOUS_SYSTEM = c(0.0194699, 2.92748,
0.0216248, 0.0272483, 0, 0.0176679, 0.0180913, 0.0080738, 0.0665414,
0.0387354), AM38_CENTRAL_NERVOUS_SYSTEM = c(0.0115334, 2.69758,
0.0085399, 0.0322822, 0.0069257, 0, 0.0357226, 0.0063769, 0.0471195,
0.271525), CAS1_CENTRAL_NERVOUS_SYSTEM = c(0.10065, 4.8228, 0.0958194,
0.0469533, 0.0518052, 0.069588, 0.0979765, 0.0556501, 0.117486,
0.147798), CCFSTTG1_CENTRAL_NERVOUS_SYSTEM = c(0.0440228, 6.04641,
0.019558, 0.0246441, 0.0158612, 0.0079897, 0.0163623, 0.0073022,
0.0601819, 0.118238), CH157MN_CENTRAL_NERVOUS_SYSTEM = c(0.0120244,
3.41429, 0.0053421, 0.0235595, 0.0173293, 0.0043646, 0.0044692,
0.0139616, 0.0408118, 0.181811), D283MED_CENTRAL_NERVOUS_SYSTEM = c(0.0638066,
5.12254, 0.0250124, 0.057781, 0.0135231, 0.0272476, 0.0279006,
0.0124515, 0.0583877, 0.343494), D341MED_CENTRAL_NERVOUS_SYSTEM = c(0.0418829,
4.97037, 0.0348888, 0.0219808, 0.0377255, 0.0380065, 0.058376,
0.0217101, 0.0937822, 1.3228), DAOY_CENTRAL_NERVOUS_SYSTEM = c(0.0277923,
4.16543, 0.051447, 0.0194477, 0.016689, 0.0336267, 0.0602569,
0.0460997, 0.0633229, 0.317934), DBTRG05MG_CENTRAL_NERVOUS_SYSTEM = c(0.062215,
4.22423, 0.0307115, 0.0580469, 0.0622661, 0.012546, 0.0128466,
0.0171996, 0.72017, 0.192542), DKMG_CENTRAL_NERVOUS_SYSTEM = c(0.0061458,
2.58862, 0.0546082, 0.0086011, 0.0332147, 0.0446161, 0.0571067,
0.0866511, 0.0985031, 0.128385), GAMG_CENTRAL_NERVOUS_SYSTEM = c(0.0638691,
4.18606, 0.023646, 0.0595902, 0.0095882, 0.0676175, 0.0296734,
0.0264853, 0.0953419, 1.13302), GB1_CENTRAL_NERVOUS_SYSTEM = c(0.0332071,
4.09682, 0.0122941, 0.0232368, 0.0199406, 0.0100446, 0.0205706,
0.036721, 0.15393, 8.77573), GI1_CENTRAL_NERVOUS_SYSTEM = c(0.0236971,
2.99664, 0.0315838, 0.0132657, 0.008538, 0.0344062, 0.0528461,
0.0196535, 0.0826642, 0.132007), GMS10_CENTRAL_NERVOUS_SYSTEM = c(0.112392,
3.29799, 0, 0.0058257, 0.007499, 0.0151096, 0.0232076, 0.0069047,
0.0392457, 0.0786757), GOS3_CENTRAL_NERVOUS_SYSTEM = c(0.0785394,
3.06583, 0.0793018, 0.0349735, 0.0128625, 0.0194374, 0.0464408,
0.0207256, 0.149777, 0.205972), H4_CENTRAL_NERVOUS_SYSTEM = c(0.0412065,
5.11983, 0.0416065, 0.0209705, 0.0337421, 0.0543895, 0.0417697,
0.018641, 0.0953581, 0.432261), HS683_CENTRAL_NERVOUS_SYSTEM = c(0.0395662,
4.82034, 0.0087891, 0.016612, 0.0285111, 0, 0.0294118, 0.0164074,
0.0708759, 0.240087), IOMMLEE_CENTRAL_NERVOUS_SYSTEM = c(0.0089568,
3.07764, 0, 0.0188027, 0.0080677, 0.0406391, 0.0083226, 0.0037142,
0.0295557, 0.178196), KALS1_CENTRAL_NERVOUS_SYSTEM = c(0.0212606,
3.22541, 0.0094454, 0.0059509, 0.0076601, 0.0154343, 0.0790207,
0.0105796, 0.0440979, 0.135353), KG1C_CENTRAL_NERVOUS_SYSTEM = c(0.0306739,
3.25635, 0.0292018, 0.0674589, 0.007894, 0.0397642, 0.0814343,
0.0036343, 0.107415, 0.248463), KNS42_CENTRAL_NERVOUS_SYSTEM = c(0.0377038,
2.77745, 0.0598239, 0.0075381, 0.0097032, 0, 0, 0.0044672, 0.0660162,
0.128592), KNS60_CENTRAL_NERVOUS_SYSTEM = c(0.0308664, 2.75686,
0.0571377, 0.0359982, 0, 0.0186731, 0.0095603, 0, 0.0606269,
0.214931), KNS81_CENTRAL_NERVOUS_SYSTEM = c(0.0376095, 4.39526,
0.041772, 0.0328967, 0.0169382, 0.0341286, 0.0349465, 0.003899,
0.0864295, 0.0841772), KS1_CENTRAL_NERVOUS_SYSTEM = c(0.0113846,
1.91478, 0.0252892, 0.0318656, 0.0102545, 0.0413236, 0.0317354,
0.004721, 0.0295168, 0.18686), LN18_CENTRAL_NERVOUS_SYSTEM = c(0.0159147,
4.40237, 0, 0.0371213, 0.0191134, 0.0192557, 0.0197172, 0.0219985,
0.0600177, 0.358841), LN215_CENTRAL_NERVOUS_SYSTEM = c(0.0188976,
6.19285, 0.0209891, 0, 0, 0.0257228, 0.0175595, 0.0274276, 0.05345,
0.422964), LN229_CENTRAL_NERVOUS_SYSTEM = c(0.0042589, 4.66724,
0.0189209, 0.0059603, 0.0153445, 0, 0.0316585, 0.0070643, 0.0602291,
0.169461)), row.names = c("DDX11L1", "WASH7P", "MIR1302-11",
"FAM138A", "OR4G4P", "OR4G11P", "OR4F5", "RP11-34P13.7", "CICP27",
"AL627309.1"), class = "data.frame")
Maybe try this without a loop:
library(dplyr)
# Step 1: swap exact zeros for 1e-7 in every column so log() never sees 0.
# Step 2: append a natural-log copy of each column, named "log_<column>".
df |>
  mutate(across(everything(), \(v) ifelse(v == 0, 1e-7, v))) |>
  mutate(across(everything(), log, .names = "log_{col}"))
I trained a BERT based encoder decoder model (EncoderDecoderModel) named ed_model with HuggingFace's transformers module.
I used the BertTokenizer named as input_tokenizer
I tokenized the input with:
txt = "Some wonderful sentence to encode"
inputs = input_tokenizer(txt, return_tensors="pt").to(device)
print(inputs)
The output clearly shows that input_ids is present in the returned dict:
{'input_ids': tensor([[ 101, 5660, 7975, 2127, 2053, 2936, 5061, 102]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
But when I try to predict, I get this error:
ed_model.forward(**inputs)
ValueError: You have to specify either input_ids or inputs_embeds
Any ideas ?
Well, apparently this is a known issue, for example: This issue of T5
The problem is probably a renaming step somewhere in the library code: since we use an encoder-decoder architecture, there are two kinds of input ids (one for the encoder and one for the decoder).
The solution is to explicitly specify the type of input id
ed_model.forward(decoder_input_ids=inputs['input_ids'],**inputs)
I wish it was documented somewhere, but now you know :-)
A naive approach to writing custom expectations from existing expectations is:
# Naive composite expectation: checks lo <= x and then x <= hi by running
# two separate expect_lte() expectations one after the other.
# NOTE(review): a single call therefore evaluates two expectations, which is
# the situation the expect_failure() examples below run into.
expect_between <- function (x, lo, hi) {
expect_lte(lo, x)
expect_lte(x, hi)
}
but this doesn't work with expect_failure, since expect_failure captures the first expectation.
expect_between(0.5, 0, 1) # PASSES
expect_between(-99, 0, 1) # FAILS
expect_between(99, 0, 1) # FAILS
expect_failure(expect_between(-99, 0, 1)) # PASSES
expect_failure(expect_between(99, 0, 1)) # FAILS <--- the problem
What is the proper way to compose expectations when writing user-defined expectations, so that they still play well with expect_failure?
I created the following function to determine the lag of two variables.
However, this function takes only two parameters, and I would like to run it over my whole dataset:
datSel <- structure(list(stat.resProp.Dwell.4 = c(0.000887705, 0.007954085,
-0.025859667, 0.024097552, 0.114052787, 0.023329207, 0.042143181,
-0.092587287, -0.004050228, -0.001624696, 0.020121403, -0.100502922,
0.057354185, 0.025463388, 0.037409854, 0.001561281, -0.028482938,
-0.004827041, 0.014411779, -0.029034298, 0.021053409, -0.067963182,
0.032070259, -0.038091783, 0.039751534, 0.027802281, -0.027802281,
-0.013355791, 0.009201236, -0.073403679, 0.021277398, -0.033901552,
0.012624153, -0.065733979, 0.032017801, -0.072042665, 0.041936911,
0.002861232, 0.017933468, -0.01698154, 0.006638242, -0.08375153,
-0.007220248, 0.0255507, 0.019980685, 0.013752673, 0.026000502,
-0.021134312, -0.019608471, 0.0166916, -0.021654389, 0.066402455,
0.024828862, -0.083302632, 0.042518482, -0.052439198, 0.037186281,
-0.056311172, -0.012270093), stat.lohn = c(0, -0.007558004, -0.015289567,
0, 0, -0.009609384, -0.019500305, 0, 0, -0.012458015, -0.025391532,
-0.000983501, 0, -0.00165265, -0.003313516, 0.000204576, 0, -0.004898564,
-0.009869709, 0, 0, -0.010574012, -0.021489482, 0, 0, -0.011534651,
-0.023476287, 0, 0, -0.00814845, -0.016498838, 0, 0, -0.0099856,
-0.020275409, -0.002818337, 0, -0.007212389, -0.014582736, 0,
0, -0.004121565, -0.008294445, 0, 0, -0.010766386, -0.021886884,
0, 0, -0.010179741, -0.02067574, 0, 0, -0.011797067, -0.024020039,
-0.002017983, -0.007343864, -0.007398196, -0.014962644), stat.resProp.Dwell.1 = c(0.012777325,
-0.002991775, -0.057819571, -0.00796817, -0.019386714, 0, 0.009740337,
0.005638356, -0.035148694, 0, 0.027084134, -0.160377856, 0.101169235,
-0.043007944, 0.043007944, -0.002580647, -0.015625318, 0.023347364,
0.007662873, -0.09607383, -0.024575906, 0.056733018, -0.000904568,
-0.058703392, 0.011450507, 0.007561473, 0.037879817, -0.032246,
0.042169401, -0.001796946, -0.024580209, -0.148788737, 0.082097362,
-0.000985707, -0.00098668, 0.003940892, -0.049380309, 0.005151995,
0.027371197, -0.025317808, 0.019299736, -0.047382704, -0.010604553,
0.082827084, -0.04516573, 0.003075348, 0.007139245, 0.022111454,
-0.004982571, -0.038701368, 0.018519048, -0.049096021, 0.061254226,
-0.020346582, 0.023363175, -0.00402415, -0.014213437, 0.023245109,
0.027587957), stat.carReg = c(0.022775414, 0.008073857, 0.002624717,
0.169431097, -0.144595366, 0.066716837, -0.086971929, 0.037928208,
0.071752161, -0.046824102, 0.106085873, 0.049965928, -0.057984255,
-0.091650262, 0.090732857, -0.082282389, 0.053376121, -0.044203971,
-0.022855425, 0.025856271, 0.000136493, 0.05579193, -0.293966656,
0.013645739, 0.059732986, 0.187020956, -0.145234848, 0.11041385,
-0.126539687, -0.000949877, 0.031473389, 0.020267816, -0.02180532,
-0.07175183, 0.147500145, -0.040559138, 0.008394819, 0.049045337,
-0.043050615, 0.094358754, -0.058408438, -0.005018402, -0.061717889,
0.100150837, -0.071100417, -0.084393865, 0.002854733, 0.002141389,
-0.026538398, 0.013480513, -0.046002189, -0.030495611, 0.052899746,
0.012842017, 0.064086498, 0.020757573, -0.043441298, -0.009563043,
0.048033848)), .Names = c("stat.resProp.Dwell.4", "stat.lohn",
"stat.resProp.Dwell.1", "stat.carReg"), row.names = c(NA, -59L
), class = "data.frame")
The function and my function call is:
# Choose a lag order for regressing y on its own lags and on the lags of x.
#
# For every lag order i in 1..max.lag an OLS model
#   y[t] ~ y[t-1], ..., y[t-i] + x[t-1], ..., x[t-i]
# is fitted. All models use the same rows: the first max.lag observations are
# dropped so that every lag up to max.lag is available.
#
# Arguments:
#   x        explanatory series, same length as y (passed to embed() as is).
#   y        response series; coerced with as.numeric().
#   max.lag  largest lag order to try (single number >= 1, default 8).
#
# Value: a list with
#   ic         max.lag x 2 matrix with columns "aic" and "bic"; row i belongs
#              to lag order i.
#   pvals      p-values of the F-tests comparing order i against order i - 1,
#              for i = max.lag, ..., 2 (empty when max.lag is 1).
#   selection  list(aic, bic, ftest): the lag order chosen by each criterion.
#              ftest is the order of the first significant (p < 0.05) test
#              when stepping down from max.lag, or 1 if none is significant.
select.lags <- function(x, y, max.lag = 8) {
  stopifnot(is.numeric(max.lag), length(max.lag) == 1, max.lag >= 1)

  y <- as.numeric(y)
  # Column j of an embed() matrix holds the series lagged by j - 1, so
  # dropping column 1 leaves exactly lags 1..max.lag.
  y.lag <- embed(y, max.lag + 1)[, -1, drop = FALSE]
  x.lag <- embed(x, max.lag + 1)[, -1, drop = FALSE]

  # Rows of y that line up with the lag matrices. Kept in `rows` rather than
  # `t` so that base::t() is not shadowed.
  rows <- tail(seq_along(y), nrow(y.lag))
  y.now <- y[rows]

  fits <- lapply(seq_len(max.lag), function(i) {
    lm(y.now ~ y.lag[, 1:i] + x.lag[, 1:i])
  })

  # Nested F-tests, largest model first. With max.lag == 1 there is nothing
  # to compare; `max.lag:2` would count *down* to index 0 and fail, so that
  # case yields an empty vector instead.
  pvals <- if (max.lag >= 2) {
    vapply(
      max.lag:2,
      function(i) anova(fits[[i]], fits[[i - 1]])[2, "Pr(>F)"],
      numeric(1)
    )
  } else {
    numeric(0)
  }

  first.sig <- which(pvals < 0.05)[1]
  ftest <- if (is.na(first.sig)) 1 else max.lag - first.sig + 1

  aic <- vapply(fits, AIC, numeric(1))
  bic <- vapply(fits, BIC, numeric(1))

  list(
    ic = cbind(aic = aic, bic = bic),
    pvals = pvals,
    selection = list(
      aic = which.min(aic),
      bic = which.min(bic),
      ftest = ftest
    )
  )
}
# Attempt to run select.lags() over the columns of datSel.
# NOTE(review): length(datSel) is a single number, so each `for` visits only
# that one index (the last column) instead of 1..length(datSel).
for (i in length(datSel) ) {
for (y in length(datSel) ) {
d1<-ts(datSel[i])
d2<-ts(datSel[y])
# `lag` is reassigned on every pass, so only the final result is kept.
lag <- select.lags(d1,d2,5)
}
}
As output of lag I get:
> lag
$ic
aic bic
[1,] -115.3623 -109.56679
[2,] -114.3370 -106.60972
[3,] -116.2026 -106.54350
[4,] -114.7030 -103.11210
[5,] -112.7153 -99.19253
[6,] -110.8018 -95.34721
[7,] -110.0812 -92.69477
[8,] -110.1427 -90.82446
$pvals
[1] 0.1952302 0.3017934 0.7858944 0.9176337 0.5040079 0.0604511 0.3406657
$selection
$selection$aic
[1] 3
$selection$bic
[1] 1
$selection$ftest
[1] 1
As you can see I get only 8 results back, however, my data.frame has 20 variables.
Any recommendation what I am doing wrong?
I appreciate your replies!
If you want to e.g. store the result of the AIC criterion:
# Run select.lags() on every ordered pair of columns of datSel and keep the
# lag order chosen by AIC: lag.aic.store[i, y] is the result for
# x = column i and y = column y.
n.vars <- length(datSel)
# Sized from the data (not a fixed 4 x 4) and labelled with the column names.
lag.aic.store <- matrix(
  NA_integer_, n.vars, n.vars,
  dimnames = list(names(datSel), names(datSel))
)
for (i in seq_len(n.vars)) {
  for (y in seq_len(n.vars)) {
    d1 <- ts(datSel[, i])
    d2 <- ts(datSel[, y])
    lag <- select.lags(d1, d2, 5)
    # Write into the matrix created above; the original assigned to the
    # undefined name `lag.store.aic`.
    lag.aic.store[i, y] <- lag$selection$aic
  }
}
You get 8 values in $ic because max.lag is 8, it has nothing to do with your number of variables.
Please also note that I added commas when indexing by variable for clarity, and that you have to loop through 1:length(datSel), as otherwise you will only process the last variable.
When I enter the following commands directly into the R console
library("xts")
mySeries <- xts(c(1.0, 2.0, 3.0, 5.0, 6.0), order.by=c(ISOdatetime(2001, 1, 1, 0, 0, 0), ISOdatetime(2001, 1, 2, 0, 0, 0), ISOdatetime(2001, 1, 3, 0, 0, 0), ISOdatetime(2001, 1, 4, 0, 0, 0), ISOdatetime(2001, 1, 5, 0, 0, 0)))
resultingSeries <- to.monthly(mySeries)
resultingSeries
I will get an output like this
mySeries.Open mySeries.High mySeries.Low mySeries.Close
Jan 2001 1 6 1 6
When I look into the attributes, I see the following output
attributes(resultingSeries)
$dim
[1] 1 4
$dimnames
$dimnames[[1]]
NULL
$dimnames[[2]]
[1] "mySeries.Open" "mySeries.High" "mySeries.Low" "mySeries.Close"
$index
[1] 978307200
attr(,"tclass")
[1] "yearmon"
$tclass
[1] "POSIXct" "POSIXt"
$tzone
[1] ""
$class
[1] "xts" "zoo"
$.indexCLASS
[1] "yearmon"
This is the same I get in Java. I'm wondering where the magic happens so that I see the nice output I get in R. I have no access to the event loop, since I'm using JRI like this (since, it's the recommended way and simplifies error handling):
REngine engine = REngine.engineForClass("org.rosuda.REngine.JRI.JRIEngine");
REXP result = engine.parseAndEval(...)
/edit
In Java I execute each command from above as follows:
REXP result = engine.parseAndEval("resultingSeries") // or any other command
What I get is
org.rosuda.REngine.REXPDouble#4ac66122+[12]
The payload being doubles: 1, 6, 1, 6
The attributes are the same as specified above.
Now R does some magic to display the output above. Is there a way I can get the same output without having to create it manually by myself? Where's the implementation stored, that R gets the above mentioned output?
Here is a piece of code that will work. Here I extracted the first element of the field mySeries.Open from the object resultingSeries (which I converted to a data frame), which is equal to 1. Note that you can't pass the whole resultingSeries object straight into Java; you will need to break it down.
package stackoverflow;
import org.rosuda.JRI.REXP;
import org.rosuda.JRI.Rengine;
/**
 * Minimal JRI example: builds a small xts series inside an embedded R
 * session, aggregates it with to.monthly(), converts the result to a data
 * frame and prints the first value of its mySeries.Open column.
 *
 * @author yschellekens
 */
public class StackOverflow {
    public static void main(String[] args) throws Exception {
        String[] Rargs = {"--vanilla"};
        // Second argument (runMainLoop) is false: R's own REPL loop is not started.
        Rengine rengine = new Rengine(Rargs, false, null);
        // Make sure the R thread actually came up before evaluating anything.
        if (!rengine.waitForR()) {
            System.err.println("Cannot load R");
            return;
        }
        try {
            rengine.eval("library('xts')");
            rengine.eval("mySeries <- xts(c(1.0, 2.0, 3.0, 5.0, 6.0), order.by=c(ISOdatetime(2001, 1, 1, 0, 0, 0), ISOdatetime(2001, 1, 2, 0, 0, 0), ISOdatetime(2001, 1, 3, 0, 0, 0), ISOdatetime(2001, 1, 4, 0, 0, 0), ISOdatetime(2001, 1, 5, 0, 0, 0)))");
            rengine.eval("resultingSeries <- to.monthly(mySeries)");
            rengine.eval("resultingSeries<-as.data.frame(resultingSeries)");
            REXP result = rengine.eval("resultingSeries$mySeries.Open");
            // eval() yields null when R could not evaluate the expression.
            if (result == null) {
                System.err.println("R evaluation failed");
                return;
            }
            System.out.println("Greeting from R: " + result.asDouble());
        } finally {
            // Shut the embedded R session down so it does not outlive main().
            rengine.end();
        }
    }
}
And the Java output:
run:
Greeting from R: 1.0
I figured out the following workaround. The solution is far from perfect.
R offers a command to capture its console output as a character vector.
capture.output( {command} )
We can access the output using
REXPString s = rengine.parseAndEval("capture.output( to.monthly(mySeries))")
String[] output = result.asStrings()
The variable output will contain all output lines
[0] mySeries.Open mySeries.High mySeries.Low mySeries.Close
[1]Jan 2001 1 6 1 6
Alternatively you could use JRIEngine and attach yourself to the event loop, which I did not want in my case (due to the more complicated error handling).