How to count row in every column based on threshold - r

I have the following data set:
dat <- structure(list(Probes = structure(1:6, .Label = c("1415670_at",
"1415671_at", "1415672_at", "1415673_at", "1415674_a_at", "1415675_at"
), class = "factor"), Genes = structure(c(2L, 1L, 4L, 5L, 6L,
3L), .Label = c("Atp6v0d1", "Copg1", "Dpm2", "Golga7", "Psph",
"Trappc4"), class = "factor"), bCD.ID.LN = c(1.133, 1.068, 1.01,
0.943, 1.048, 1.053), bCD.ID.LV = c(1.049, 1.006, 0.883, 0.799,
0.96, 1.104), bCD.ID.SP = c(1.124, 1.234, 1.029, 1.064, 1.118,
1.057), bCD.IP.LV = c(1.013, 1.082, 1.061, 0.982, 1.191, 1.053
), bCD.IP.SP = c(0.986, 1.102, 1.085, 0.997, 1.141, 1.041)), .Names = c("Probes",
"Genes", "bCD.ID.LN", "bCD.ID.LV", "bCD.ID.SP", "bCD.IP.LV",
"bCD.IP.SP"), row.names = c(NA, 6L), class = "data.frame")
It looks like this:
> dat
Probes Genes bCD.ID.LN bCD.ID.LV bCD.ID.SP bCD.IP.LV bCD.IP.SP
1 1415670_at Copg1 1.133 1.049 1.124 1.013 0.986
2 1415671_at Atp6v0d1 1.068 1.006 1.234 1.082 1.102
3 1415672_at Golga7 1.010 0.883 1.029 1.061 1.085
4 1415673_at Psph 0.943 0.799 1.064 0.982 0.997
5 1415674_a_at Trappc4 1.048 0.960 1.118 1.191 1.141
6 1415675_at Dpm2 1.053 1.104 1.057 1.053 1.041
What I want tod for 3rd column onward count row where the value is > 1.1
So it the end it looks like this:
bCD.ID.LN 1
bCD.ID.LV 1
bCD.ID.SP 3
bCD.IP.LV 1
bCD.IP.SP 2
How can I do that?

We can try colSums on a logical matrix based on the numeric columns in the dataset.
Count <- colSums(dat[-(1:2)] > 1.1, na.rm=TRUE)
If we need it as a data.frame
d1 <- data.frame(Cnames = names(Count), Count=unname(Count))
If it is a large dataset, converting to a logical matrix may not be memory efficient, in that case, it would be better to loop using vapply
vapply(dat[-(1:2)], function(x) sum(x > 1.1, na.rm=TRUE), 0)

Yet another version, this time using dplyr
dat %>%
select(-c(Probes, Genes)) %>%
summarise_each (funs(sum((. > 1.1))))

Here's an alternative version using lapply()
lapply(dat[-c(1:2)], function(x) length(which(x > 1.1)))
or this if you want it as a data.frame()
data.frame( lapply(dat[-c(1:2)], function(x) length(which(x > 1.1))))

Related

pivot_longer with column pairs

I am again struggling with transforming a wide df into a long one using pivot_longer The data frame is a result of power analysis for different effect sizes and sample sizes, this is how the original df looks like:
es_issue_owner es_independence es_party pwr_issue_owner_1200 pwr_independence_1200 pwr_party_1200 pwr_issue_owner_2400 pwr_independence_2400 pwr_party_2400
1 0.1 0.1 0.1 0.087 0.080 0.081 0.130 0.163 0.102
2 0.2 0.2 0.2 0.235 0.273 0.157 0.406 0.513 0.267
Or with dput:
example <- structure(list(es_issue_owner = c(0.1, 0.2), es_independence = c(0.1,
0.2), es_party = c(0.1, 0.2), pwr_issue_owner_1200 = c(0.087,
0.235), pwr_independence_1200 = c(0.08, 0.273), pwr_party_1200 = c(0.081,
0.157), pwr_issue_owner_2400 = c(0.13, 0.406), pwr_independence_2400 = c(0.163,
0.513), pwr_party_2400 = c(0.102, 0.267)), row.names = 1:2, class = "data.frame")
Each effect size (es) for three meassures ("independence", "issueowner", "party") is paired with a power calculation on a 1200 and on a 2400 sample size. This is how the output I want to get would look like based on the example above:
type es pwr value
1 independence 0.1 1200 0.080
2 issue_owner 0.1 1200 0.087
3 party 0.1 1200 0.081
4 independence 0.2 1200 0.273
5 issue_owner 0.2 1200 0.235
6 party 0.2 1200 0.157
7 independence 0.1 2400 0.163
8 issue_owner 0.1 2400 0.130
9 party 0.1 2400 0.102
10 independence 0.2 2400 0.513
11 issue_owner 0.2 2400 0.406
12 party 0.2 2400 0.267
or, with dput:
output <- structure(list(type = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L), .Label = c("independence", "issueowner",
"party"), class = "factor"), es = c(0.1, 0.1, 0.1, 0.2, 0.2,
0.2, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2), pwr = c(1200, 1200, 1200,
1200, 1200, 1200, 2400, 2400, 2400, 2400, 2400, 2400), value = c("0.080",
"0.087", "0.081", "0.273", "0.235", "0.157", "0.163", "0.130",
"0.102", "0.513", "0.406", "0.267")), out.attrs = list(dim = c(type = 3L,
es = 2L, pwr = 2L, value = 1L), dimnames = list(type = c("type=independence",
"type=issueowner", "type=party"), es = c("es=0.1", "es=0.2"),
pwr = c("pwr=1200", "pwr=2400"), value = "value=NA")), class = "data.frame", row.names = c(NA,
-12L))
As a start I tried experimenting with this:
example %>%
pivot_longer(cols = everything(),
names_pattern = "(es_[A-Za-z]+)(pwr_[A-Za-z]+_1200)(pwr_[A-Za-z]+_2400)",
# names_sep = "(?=\\d)_(?=\\d)",
names_to = c("es", "pwr_1200", "pwr_2400"),
values_to = "value")
But it did not work, so I tried from two steps, which sort of works, but the "pairing" gets messed up:
example %>%
# pivot_longer(cols = everything(),
# names_pattern = "(es_[A-Za-z]+)(pwr_[A-Za-z]+_1200)(pwr_[A-Za-z]+_2400)",
# # names_sep = "(?=\\d)_(?=\\d)",
# names_to = c("es", "pwr_1200", "pwr_2400"),
# values_to = "value")
pivot_longer(cols = contains("pwr_"),
# names_pattern = "es_pwr(.*)1200_pwr(.*)2400",
names_sep = "_(?=\\d)",
names_to = c("pwr_type", "pwr_sample"), values_to = "value") %>%
pivot_longer(cols = contains("es_"),
# names_pattern = "es_pwr(.*)1200_pwr(.*)2400",
# names_sep = "_(?=\\d)",
names_to = "es_type", values_to = "es")
I would appreciate any help!
library(tidyverse)
example %>%
pivot_longer(cols = starts_with("es"), names_to = "type", names_prefix = "es_", values_to = "es") %>%
pivot_longer(cols = starts_with("pwr"), names_to = "pwr", names_prefix = "pwr_") %>%
filter(substr(type, 1, 3) == substr(pwr, 1, 3)) %>%
mutate(pwr = parse_number(pwr)) %>%
arrange(pwr, es, type)
output
type es pwr value
1 independence 0.1 1200 0.08
2 issue_owner 0.1 1200 0.087
3 party 0.1 1200 0.081
4 independence 0.2 1200 0.273
5 issue_owner 0.2 1200 0.235
6 party 0.2 1200 0.157
7 independence 0.1 2400 0.163
8 issue_owner 0.1 2400 0.13
9 party 0.1 2400 0.102
10 independence 0.2 2400 0.513
11 issue_owner 0.2 2400 0.406
12 party 0.2 2400 0.267

transform data.frame matrix-column into columns

When using aggregate with compound function, the resulting data.frame has matrices inside columns.
ta=aggregate(cbind(precision,result,prPo)~rstx+qx+laplace,t0
,function(x) c(x=mean(x),m=min(x),M=max(x)))
ta=head(ta)
dput(ta)
structure(list(rstx = c(3, 3, 2, 3, 2, 3), qx = c(0.2, 0.25,
0.3, 0.3, 0.33, 0.33), laplace = c(0, 0, 0, 0, 0, 0), precision = structure(c(0.174583333333333,
0.186833333333333, 0.3035, 0.19175, 0.30675, 0.193666666666667,
0.106, 0.117, 0.213, 0.101, 0.22, 0.109, 0.212, 0.235, 0.339,
0.232, 0.344, 0.232), .Dim = c(6L, 3L), .Dimnames = list(NULL,
c("x", "m", "M"))), result = structure(c(-142.333333333333,
-108.316666666667, -69.1, -85.7, -59.1666666666667, -68.5666666666667,
-268.8, -198.2, -164, -151.6, -138.2, -144.8, -30.8, -12.2, -14.2,
-3.8, -12.6, -3.4), .Dim = c(6L, 3L), .Dimnames = list(NULL,
c("x", "m", "M"))), prPo = structure(c(3.68416666666667,
3.045, 2.235, 2.53916666666667, 2.0775, 2.23666666666667, 1.6,
1, 1.02, 0.54, 0.87, 0.31, 5.04, 4.02, 2.77, 3.53, 2.63, 3.25
), .Dim = c(6L, 3L), .Dimnames = list(NULL, c("x", "m", "M")))), .Names = c("rstx",
"qx", "laplace", "precision", "result", "prPo"), row.names = c(NA,
6L), class = "data.frame")
Is there a function that transform data.frame matrix-colum into columns?
Manually, for each matrix-column, column bind plus column delete works:
colnames(ta)
[1] "rstx" "qx" "laplace" "precision" "result" "prPo"
ta[,"precision"] # ta[,4]
x m M
[1,] 0.1745833 0.106 0.212
[2,] 0.1868333 0.117 0.235
[3,] 0.3035000 0.213 0.339
[4,] 0.1917500 0.101 0.232
[5,] 0.3067500 0.220 0.344
[6,] 0.1936667 0.109 0.232
#column bind + column delete
ta=cbind(ta,precision=ta[,4])
ta=ta[,-4]
colnames(ta)
[1] "rstx" "qx" "laplace" "result" "prPo" "precision.x" "precision.m"
[8] "precision.M"
ta
rstx qx laplace result.x result.m result.M prPo.x prPo.m prPo.M precision.x precision.m
1 3 0.20 0 -142.33333 -268.80000 -30.80000 3.684167 1.600000 5.040000 0.1745833 0.106
2 3 0.25 0 -108.31667 -198.20000 -12.20000 3.045000 1.000000 4.020000 0.1868333 0.117
3 2 0.30 0 -69.10000 -164.00000 -14.20000 2.235000 1.020000 2.770000 0.3035000 0.213
4 3 0.30 0 -85.70000 -151.60000 -3.80000 2.539167 0.540000 3.530000 0.1917500 0.101
5 2 0.33 0 -59.16667 -138.20000 -12.60000 2.077500 0.870000 2.630000 0.3067500 0.220
6 3 0.33 0 -68.56667 -144.80000 -3.40000 2.236667 0.310000 3.250000 0.1936667 0.109
precision.M
1 0.212
2 0.235
3 0.339
4 0.232
5 0.344
6 0.232
matrix doesn't support matrix-column. So as.matrix() transform data.frame into matrix, breaking up matrix-column.
Here is my idea:
library(tidyverse)
ta2 <- ta %>%
as.matrix() %>%
as.data.frame()
Somewhere in Stackoverflow I found a very simple solution:
cbind(ta[-ncol(ta)],ta[[ncol(ta)]])
rstx qx laplace precision.x precision.m precision.M result.x result.m result.M x m
1 3 0.20 0 0.1745833 0.1060000 0.2120000 -142.33333 -268.80000 -30.80000 3.684167 1.60
2 3 0.25 0 0.1868333 0.1170000 0.2350000 -108.31667 -198.20000 -12.20000 3.045000 1.00
3 2 0.30 0 0.3035000 0.2130000 0.3390000 -69.10000 -164.00000 -14.20000 2.235000 1.02
4 3 0.30 0 0.1917500 0.1010000 0.2320000 -85.70000 -151.60000 -3.80000 2.539167 0.54
5 2 0.33 0 0.3067500 0.2200000 0.3440000 -59.16667 -138.20000 -12.60000 2.077500 0.87
6 3 0.33 0 0.1936667 0.1090000 0.2320000 -68.56667 -144.80000 -3.40000 2.236667 0.31
M
1 5.04
2 4.02
3 2.77
4 3.53
5 2.63
6 3.25
Just that!

How to perform LOOCV in linear SVM in R e1071 and create contingency table

I have a data frame testdata
X95 X96 X97 X98 X99 X100 status
1 0.0096 0.0697 0.0021 0.0286 0.0088 0.0195 1
2 0.0133 0.0658 0.0022 0.0215 0.0114 0.0186 1
3 0.0091 0.0671 0.0027 0.0240 0.0101 0.0171 1
4 0.0095 0.0656 0.0011 0.0363 0.0092 0.0130 0
5 0.0081 0.0726 0.0018 0.0243 0.0095 0.0187 0
6 0.0088 0.0720 0.0015 0.0253 0.0094 0.0194 0
dput(testdata)
structure(list(X95 = c(0.0096, 0.0133, 0.0091, 0.0095, 0.0081,
0.0088), X96 = c(0.0697, 0.0658, 0.0671, 0.0656, 0.0726, 0.072
), X97 = c(0.0021, 0.0022, 0.0027, 0.0011, 0.0018, 0.0015), X98 = c(0.0286,
0.0215, 0.024, 0.0363, 0.0243, 0.0253), X99 = c(0.0088, 0.0114,
0.0101, 0.0092, 0.0095, 0.0094), X100 = c(0.0195, 0.0186, 0.0171,
0.013, 0.0187, 0.0194), status = c(1, 1, 1, 0, 0, 0)), .Names = c("X95",
"X96", "X97", "X98", "X99", "X100", "status"), class = "data.frame", row.names = c(NA,
6L))
I want to create a linear SVM which accurately classifies the status as either 0 or 1 and can predict future observations using a leave one out method.
This is what I currently have
y.svm <- rep(NA, nrow(testdata))
for (i in 1:nrow(testdata)) {
testset <- testdata[i,]
trainset <- testdata[-i,]
model.svm <-
svm(
testdata[-i, 6] ~ .,
data = trainset,
type = "C-classification",
kernel = "linear"
)
y.svm[i] <- as.character(predict(model.svm, testset))
}
I would like to be able to generate a contingency table showing me TP, FP, FN, and TN values. How can this be done?
Thanks!
You can use this solution:
library(gmodels)
actual <- testdata[,7]
predicted <- y.svm
#basic R solution
table(predicted,actual)
#Output similar to what users of SPSS or SAS expects
CrossTable(predicted,actual)

What is the reason of the error: non-numeric argument to binary operator in R

I am trying to make grouped (by year) bar-plot with separate columns based on anomaly data (+/-).
I have used following script and data in R.
mydata <- read.csv("F:/MOD13A1_NDVI_500/Mod_ndvi_500_excel/ndvi_anomaly.csv", head=TRUE)
mydata
NZ X2000 X2001 X2002 X2003 X2004 X2005 X2006 X2007 X2008
1 High_mountain 0.007 -0.003 -0.002 -0.016 0.011 0.016 -0.007 0.000 -0.003
2 Taiga -0.002 0.018 -0.006 -0.022 0.018 0.004 -0.016 0.025 0.003
3 Forest_steppe 0.004 0.011 -0.044 -0.008 0.009 0.003 -0.004 -0.005 -0.001
4 Steppe 0.001 -0.016 -0.002 0.007 -0.022 -0.004 -0.017 -0.053 0.000
par(xpd=T, mar=par()$mar+c(0,0,0,6))
barplot(as.matrix(mydata[1:6,]), beside=T)
It returns error:
Error in -0.01 * height : non-numeric argument to binary operator
What is the reason of this kind of error? I have found several questions with error non-numeric argument to binary operator in this site, but each cases were different. I thought it's cause might be negative(-) values. How to avoid from this error?
It doesn't have to do with the negative values. You are mixing numeric and character types in a matrix which coverts everything to character. Observe
as.matrix(mydata[,1:6])
# NZ X2000 X2001 X2002 X2003 X2004
# 1 "High_mountain" " 0.007" "-0.003" "-0.002" "-0.016" " 0.011"
# 2 "Taiga" "-0.002" " 0.018" "-0.006" "-0.022" " 0.018"
# 3 "Forest_steppe" " 0.004" " 0.011" "-0.044" "-0.008" " 0.009"
# 4 "Steppe" " 0.001" "-0.016" "-0.002" " 0.007" "-0.022"
YOu can't really make a barplot with a much of character values. Try leaving out the names
barplot(as.matrix(mydata[,2:6]), beside=T)
to get
This is assuming your mydata ended up looking something like
mydata<-structure(list(NZ = structure(c(2L, 4L, 1L, 3L), .Label = c("Forest_steppe",
"High_mountain", "Steppe", "Taiga"), class = "factor"), X2000 = c(0.007,
-0.002, 0.004, 0.001), X2001 = c(-0.003, 0.018, 0.011, -0.016
), X2002 = c(-0.002, -0.006, -0.044, -0.002), X2003 = c(-0.016,
-0.022, -0.008, 0.007), X2004 = c(0.011, 0.018, 0.009, -0.022
), X2005 = c(0.016, 0.004, 0.003, -0.004), X2006 = c(-0.007,
-0.016, -0.004, -0.017), X2007 = c(0, 0.025, -0.005, -0.053),
X2008 = c(-0.003, 0.003, -0.001, 0)), .Names = c("NZ", "X2000",
"X2001", "X2002", "X2003", "X2004", "X2005", "X2006", "X2007",
"X2008"), class = "data.frame", row.names = c("1", "2", "3",
"4"))

R: how to use lapply or for to loop over all values of multiple strings

I have a dataset with one variable categorized according to 200 alphabetic codes code1<-c("AAA","BBB","DDD","EEE","FFF"), 2 alphabetic codes code2<-c("Yyy","Zzzzz"), and 41 numeric codes code3<-seq(1970,2011,1).
I have a function that produces a 8-number vector from each subset of the data according to unique values of code1,code2,code3. So, I would like to run a few lines of code on each subset of these data.
The complete list is imported as a data frame data, and I currently work by extracting each subset of the data from the data frame, analyzing it, and then saving the output
The problem is that it would be cumbersome to loop through all the values of code1 and code2 and code3 according to this scheme, and it would be much better to produce a single output data frame as well, with the 8 numbers saved alongside the unique values of code1, code2, and code3 which produced them.
I'm sure that this could be done without resorting to loops over values of code1-code3 and assign(), but being a neophyte I'm afraid I can't quite put it together.
Thanks -- E
Additional data:
This is what the output vector from the function I am running looks like, manually subset for one series:
output1<-fxn(data$input,[which(data$code1=='AAA'&data$code2=='Yyy'&data$code3==1990)])
output2<-fxn2(output1)
str(output2$out[,2]): num [1:8] 0.009 0.648 0.304 0.004 0.445 ...
output2$out[,2]: [1] 0.009 0.648 0.304 0.004 0.445 36.720 0.000 1.103
Additional data:
In response to requests, this faked output dataset approximates what I'm looking for-- each row of the file is from one completed run of the function fxn2. The first 8 columns are output by the function; the last 3 columns are added to distinguish unique values of code1,code2,code3:
> head(data)
X.x1 x2 x3 x4 x5 x6 x7 x8 code3 code2 code1
1 0.008 0.595 0.185 0.005 0.173 36.744 0 1.102 1970 male BGR
2 0.004 0.242 0.276 0.005 0.348 46.017 0 1.108 1971 male BGR
3 0.002 0.553 0.242 0.005 0.247 35.424 0 1.107 1972 male BGR
4 0.005 0.593 0.270 0.004 0.312 43.701 0 1.105 1973 male BGR
5 0.009 0.660 0.217 0.005 0.266 37.955 0 1.103 1974 male BGR
6 0.006 0.347 0.297 0.005 0.411 50.959 0 1.108 1975 male BGR
> dput(head(data))
structure(list(X.x1 = c(0.008, 0.004, 0.002, 0.005, 0.009, 0.006
), x2 = c(0.595, 0.242, 0.553, 0.593, 0.66, 0.347), x3 = c(0.185,
0.276, 0.242, 0.27, 0.217, 0.297), x4 = c(0.005, 0.005, 0.005,
0.004, 0.005, 0.005), x5 = c(0.173, 0.348, 0.247, 0.312, 0.266,
0.411), x6 = c(36.744, 46.017, 35.424, 43.701, 37.955, 50.959
), x7 = c(0, 0, 0, 0, 0, 0), x8 = c(1.102, 1.108, 1.107, 1.105,
1.103, 1.108), year = 1970:1975, sex = structure(c(1L, 1L, 1L,
1L, 1L, 1L), .Label = "male", class = "factor"), iso3 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "BGR", class = "factor")), .Names = c("X.x1",
"x2", "x3", "x4", "x5", "x6", "x7", "x8", "year", "sex", "iso3"
), row.names = c(NA, 6L), class = "data.frame")
I think you can simplify your code by doing this. If you give more details about desired output, I will update the answer accordingly.
code1<-c("AAA","BBB","DDD","EEE","FFF")
code2<-c("Yyy","Zzzzz")
code3<-seq(1970,2011,1)
params <- expand.grid(code1, code2, code3)
names(params) <- c('code1', 'code2', 'code3')
myFunc <- function(code1, code2, code3) {
##add your function code here.
...
...
return(output2$out[,2])
}
LL <- mapply(FUN=myFunc, code1 = params$code1, code2 = params$code2, code3 = params$code3)
result <- split(LL, rep(1:ncol(LL), each = nrow(LL)))
result <- do.call(rbind, result)
result <- cbind(result, params) result <- cbind(result, params)

Resources