While constructing a data-frame, columns are replicated if lengths differ.
> data.frame(x = c(1,2), y = NA_integer_)
x y
1 1 NA
2 2 NA
However, when I try to do this with bit64::NA_integer64_, I get an error. Does anyone know what could be happening? rep() works if it is called separately on bit64::NA_integer64_.
> data.frame(x = c(1,2), y = bit64::NA_integer64_)
Error in data.frame(x = c(1, 2), y = bit64::NA_integer64_) :
arguments imply differing number of rows: 2, 1
> rep(bit64::NA_integer64_, 2)
integer64
[1] <NA> <NA>
data.frame will only recycle:
Vector with no attributes other than names
factor
AsIs character
Date
POSIXct
tibble doesn't have this problem.
tibble::tibble(x = c(1,2), y = bit64::NA_integer64_)
#> # A tibble: 2 x 2
#> x y
#> <dbl> <int64>
#> 1 1 NA
#> 2 2 NA
Here is the relevant snippet from data.frame
for (i in seq_len(n)[nrows < nr]) {
xi <- vlist[[i]]
if (nrows[i] > 0L && (nr%%nrows[i] == 0L)) {
xi <- unclass(xi)
fixed <- TRUE
for (j in seq_along(xi)) {
xi1 <- xi[[j]]
if (is.vector(xi1) || is.factor(xi1))
xi[[j]] <- rep(xi1, length.out = nr)
else if (is.character(xi1) && inherits(xi1, "AsIs"))
xi[[j]] <- structure(rep(xi1, length.out = nr),
class = class(xi1))
else if (inherits(xi1, "Date") || inherits(xi1, "POSIXct"))
xi[[j]] <- rep(xi1, length.out = nr)
else {
fixed <- FALSE
break
}
}
if (fixed) {
vlist[[i]] <- xi
next
}
}
stop(gettextf("arguments imply differing number of rows: %s",
paste(unique(nrows), collapse = ", ")), domain = NA)
}
Related
I have data that looks like this :
is_severe encoding sn_id
6 1 1 chr1 17689
7 0 2 chr1 17689
8 1 1 chr1 17689
9 1 2 chr1 69511
10 1 2 chr1 69511
11 1 1 chr1 69511
12 0 1 chr1 69511
I performed a statistical test on every "group" of values based on the sn_id column.
this is the function for the statistical test:
catt <-
function(y, x, score = c(0, 1, 2)) {
miss <- unique(c(which(is.na(y)), which(is.na(x))))
n.miss <- length(miss)
if(n.miss > 0) {
y <- y[-miss]
x <- x[-miss]
}
if(!all((y == 0) | (y == 1)))
stop("y should be only 0 or 1.")
if(!all((x == 0) | (x == 1) |(x == 2)))
stop("x should be only 0, 1 or 2.")
ca <- x [y == 1]
co <- x [y == 0]
htca <- table(ca)
htco <- table(co)
A <- matrix(0, 2, 3)
colnames(A) <- c(0, 1, 2)
rownames(A) <- c(0, 1)
A[1, names(htca)] <- htca
A[2, names(htco)] <- htco
ptt <- prop.trend.test(A[1, ], colSums(A), score = score)
p.value = as.numeric(ptt$p.value)
res=p.value
return(res)}
and i performed it on the groups of snp_id using the by function:
send=by(merged_df_normal,merged_df_normal$snp_id, function (merged_df_normal) {catt(merged_df_normal$is_sever_int,merged_df_normal$encoding)})
and got these results for example :
merged_df_normal$snp_id: chr11441806
[1] 0.6274769
---------------------------------------------------------------------
merged_df_normal$snp_id: chr1144192891
[1] NA
i wanted to transform this into a data frame which will look like this:
snp_id pvalue
chr11441806 0.6274769
chr1144192891 NA
I tried this :
do.call(rbind,list(send)
and it returned a matrix
that looks like this:
chr11441806 chr1144192891
0.6274769 NA
I had to edit the function after accepting an answer :
catt_2 <-
function(y, x, score = c(0, 1, 2)) {
miss <- unique(c(which(is.na(y)), which(is.na(x))))
n.miss <- length(miss)
if(n.miss > 0) {
y <- y[-miss]
x <- x[-miss]
}
if(!all((y == 0) | (y == 1)))
stop("y should be only 0 or 1.")
if(!all((x == 0) | (x == 1) |(x == 2)))
stop("x should be only 0, 1 or 2.")
ca <- x [y == 1]
co <- x [y == 0]
htca <- table(ca)
htco <- table(co)
A <- matrix(0, 2, 3)
colnames(A) <- c(0, 1, 2)
rownames(A) <- c(0, 1)
A[1, names(htca)] <- htca
A[2, names(htco)] <- htco
ptt <- prop.trend.test(A[1, ], colSums(A), score = score)
res <- list(
chisq = as.numeric(ptt$statistic),
p.value = as.numeric(ptt$p.value)
)
return(res)
}
and now the results are :
send=by(merged_df_normal,merged_df_normal$snp_id, function (merged_df_normal) {catt_2(merged_df_normal$is_sever,merged_df_normal$encoding)})
merged_df_normal$snp_id: chr11007252
$chisq
[1] NA
$p.value
[1] NA
------------------------------------------------------------------------
merged_df_normal$snp_id: chr1100731820
$chisq
[1] 0.9111779
$p.value
[1] 0.3398021
and what I would like it to be is:
snp_id pvalue chisq
chr11441806 0.6274769 0.9111779
chr1144192891 NA NA
the answer:
library(data.table)
setDT(merged_df_normal)
merged_df_normal[,.(p.value=catt(is_sever,encoding)),snp_id]
worked really well for getting just the p.value but is there a way to edit the above answer and add a new column chisq? thank you for the help the previous answer
I believe you can just apply catt() to each group of sn_id. Let's say your original data is called df. Then, you can do the following:
library(data.table)
setDT(df)
df[,.(p.value=catt(is_severe,encoding)),sn_id]
You need to adjust your function so that it handles sn_id groups that don't have sufficient data; in your example data frame, catt() only runs without error on sn_id == chr1 69511..
In general, however, the output will look like this, with one row in the frame for each sn_id value
sn_id p.value
<char> <num>
1: chr1 69511 0.2482131
i have data frame that looks like this :
is severe encoding sn_id
1 1 1
0 2 1
1 2 2
0 1 2
1 1 2
im using on by function this function :
catt <-
function(y, x, score = c(0, 1, 2)) {
miss <- unique(c(which(is.na(y)), which(is.na(x))))
n.miss <- length(miss)
if(n.miss > 0) {
y <- y[-miss]
x <- x[-miss]
}
if(!all((y == 0) | (y == 1)))
stop("y should be only 0 or 1.")
if(!all((x == 0) | (x == 1) |(x == 2)))
stop("x should be only 0, 1 or 2.")
ca <- x [y == 1]
co <- x [y == 0]
htca <- table(ca)
htco <- table(co)
A <- matrix(0, 2, 3)
colnames(A) <- c(0, 1, 2)
rownames(A) <- c(0, 1)
A[1, names(htca)] <- htca
A[2, names(htco)] <- htco
ptt <- prop.trend.test(A[1, ], colSums(A), score = score)
#list(#"2x3-table" = A,
#chisq = as.numeric(ptt$statistic),
#df = as.numeric(ptt$parameter),
res= p.value = as.numeric(ptt$p.value)
#n.miss = n.miss)
return(res)
}
when i run it :
by(es_test,es_test$sn_id, function (es_test) {catt(es_test$ï..is_severe,es_test$encoding)})
i get these results:
es_test$sn_id: 1
[1] 0.1572992
------------------------------------------------------------------------
es_test$sn_id: 2
[1] 0.3864762
it is not a very comfortable format as i want to further work with it , is there any way to get these results as list :[0.157,0.386]?
i tried this :
result_pv=c(by(es_test,es_test$sn_id, function (es_test) {catt(es_test$ï..is_severe,es_test$encoding)}))
but it produced double and i want it as vector or list :
the double :
Browse[6]> result_pv
1 2
0.1572992 0.3864762
> typeof(result_pv)
[1] "double"
what i want to do with it later is to add this result_pv to data frame as column and when it is a double i cant do that
thank you
To cross validation for CCLE (Cancer Cell Line Encyclopedia) drug data I tried to convert the following codes from matlab to R. However, I was unsuccessful. Matlab codes work fine and can create both a *cross.mat that is a group of 10 fold CV data for each data set and a *data.mat that is the grouped data of 10 times of CV of each data set.
I will be appreciate if you can help me find my mistake.
#This function is about 10-fold cross-validation data grouping
getcrossMatrixs <- function(MM){
library(pracma)
N <- nnz(MM)
zeroM <- matrix(0L, nrow = dim(MM)[1], ncol = dim(MM)[2])
D <- randperm(N)
first <- floor(N/10)
w = which(MM != 0, arr.ind=TRUE);
nrows=w[,1]; ncols=w[,2]
crossdata <- list()
for (i in 1:10) {
crossdata[[i]] <- zeroM
}
for (i in 1:10){
for (j in (1+(i-1)*first):(i*first)){
crossdata[[i]][c(nrows[D[j]]),c(ncols[D[j]]) ] <- MM[c(nrows[D[j]]),c(ncols[D[j]])]
}
}
k <- (N-(10*first))
i <- 10*first+1
for (j in 1:k){
crossdata[[j]][c(nrows[D[i]]),c(ncols[D[i]]) ] <- MM[c(nrows[D[i]]),c(ncols[D[i]])]
i <- i+1
}
}
#The following lines is the main for calling above function.
library(foreach)
n.cores <- parallel::detectCores()
my.cluster <- parallel::makeCluster(
n.cores,
type = "PSOCK"
)
print(my.cluster)
#> socket cluster with 16 nodes on host 'localhost'
doParallel::registerDoParallel(cl = my.cluster)
foreach::getDoParRegistered()
#> [1] TRUE
CCLEdata <- list()
#MM<-matrix(read_csv("MM.csv", col_names = FALSE, show_col_types = FALSE), rownames.force = NA)
MM <- matrix(seq(0, 4.5, length.out = 11784), nrow = 491) #datamatrix like CCLE drug activity area sensitivity matrrix(491*24)
foreach(i = 1:10) %dopar% {
CCLEcross <- getcrossMatrixs(MM)
CCLEdata[[i]] <- CCLEcross
}
#> [[1]]
#> NULL
#>
#> [[2]]
#> NULL
#>
#> [[3]]
#> NULL
#>
#> [[4]]
#> NULL
#>
#> [[5]]
#> NULL
#>
#> [[6]]
#> NULL
#>
#> [[7]]
#> NULL
#>
#> [[8]]
#> NULL
#>
#> [[9]]
#> NULL
#>
#> [[10]]
#> NULL
Created on 2022-08-29 with reprex v2.0.2
Actually when I use the original CCLE dataset the error is changing in the main.R:
Error in { : task 1 failed - "is.numeric(x) || is.complex(x) is not TRUE"
or
Error in { :
task 1 failed - "attempt to select less than one element in integerOneIndex"
%These are from Matlab
function [crossdata] = getcrossMatrixs(MM)
N = nnz(MM(:));
zeroM = zeros(size(MM));
D = randperm(N);
first = floor(N/10);
[nrows,ncols] = find(MM);
crossdata = {};
for i = 1:10
crossdata{i} = zeroM;
end
for i = 1:10
for j = 1+(i-1)*first:i*first
crossdata{i}(nrows(D(j)),ncols(D(j))) = MM(nrows(D(j)),ncols(D(j)));
end
end
k=N -10*first ;
i=10*first+1;
for j=1:k
crossdata{j}(nrows(D(i)),ncols(D(i))) = MM(nrows(D(i)),ncols(D(i)));
i=i+1;
end
end
load('MM.mat')
parfor i=1:10
[CCLEcross] = getcrossMatrixs(MM);
CCLEdata{i}=CCLEcross;
end
I didn't look too closely to figure out what was wrong. I based this function on the Matlab function supplied. Note that for this particular example, going parallel is more expensive due to overhead. Parallel will provide performance with large enough matrices and/or more samples.
library(parallel)
MM <- matrix(seq(0, 4.5, length.out = 11784), nrow = 491)
getcrossMatrixs <- function(MM, parts = 10L) {
D <- sample(which(MM != 0))
first <- length(D) %/% parts
last <- length(D) %% parts
idx <- c(0L, cumsum(c(rep(first + 1L, last), rep(first, parts - last))))
mZero <- matrix(0, nrow(MM), ncol(MM))
lapply(1:parts, function(i, m) {m[D[(idx[i] + 1L):idx[i + 1L]]] <- MM[D[(idx[i] + 1L):idx[i + 1L]]]; m}, mZero)
}
reps <- 10L
clust <- makeCluster(min(detectCores() - 1L, reps))
clusterExport(clust, c("getcrossMatrixs", "MM"))
CCLEdata <- parLapply(clust, 1:reps, function(x) getcrossMatrixs(MM))
stopCluster(clust)
# check that each set of matrices returned has all elements of MM
identical(rep(list(MM), reps), lapply(1:reps, function(i) Reduce("+", CCLEdata[[i]], matrix(0, nrow(MM), ncol(MM)))))
#> [1] TRUE
And here's a cleaned-up version of the Matlab function:
function [crossdata] = getcrossMatrixs(MM)
idx = find(MM);
N = length(nrows);
zeroM = zeros(size(MM));
idx = idx(randperm(N));
first = floor(N/10);
crossdata = cell(10, 1);
for i = 1:10
crossdata{i} = zeroM;
end
for i = 1:10
j = 1 + (i - 1)*first:i*first;
crossdata{i}(idx(j)) = MM(idx(j));
end
k = N - 10*first;
j = 10*first + 1;
for i = 1:k
crossdata{i}(idx(j)) = MM(idx(j));
j = j + 1;
end
end
I have a data frame(cat_df) which has categorical variables only. I want to impute mode values to missing values in each variable.
I tried the following code. But It's not working.
Way -1
cat_df[is.na(cat_df)] <- modefunc(cat_df, na.rm = TRUE)
cat_df
modefunc <- function(x){
tabresult <- tabulate(x)
themode <- which(tabresult == max(tabresult))
if(sum(tabresult == max(tabresult))>1) themode <- NA
return(themode)
}
Error in modefunc(cat_df, na.rm = TRUE) :
unused argument (na.rm = TRUE)
Way -2
cat_df[is.na(cat_df)] <- my_mode(cat_df[!is.na(cat_df)])
cat_df
my_mode <- function(x){
unique_x <- unique(x)
mode <- unique_x[which.max(tabulate(match(x,unique_x)))]
mode
}
The above code is not not imputing the mode values
Is there any other way to impute mode values to categoriacal variables in a dataframe?
Update:
This Mode function is for dataframes:
my_mode <- function (x, na.rm) {
xtab <- table(x)
xmode <- names(which(xtab == max(xtab)))
if (length(xmode) > 1) xmode <- ">1 mode"
return(xmode)
}
for (var in 1:ncol(cat_df)) {
if (class(cat_df[,var])=="numeric") {
cat_df[is.na(cat_df[,var]),var] <- mean(cat_df[,var], na.rm = TRUE)
} else if (class(cat_df[,var]) %in% c("character", "factor")) {
cat_df[is.na(cat_df[,var]),var] <- my_mode(cat_df[,var], na.rm = TRUE)
}
}
This mode function is for vectors
Try this and please let me know.
#define missing values in vector
values <- unique(cat_column)[!is.na(cat_column)]
# mode of cat_column
themode <- values[which.max(tabulate(match(cat_column, values)))]
#assign missing vector
imputevector <- cat_column
imputevector[is.na(imputevector)] <- themode
User Defined Function
Here is the mode function I use with an additional line to choose a single mode in the event there are actually multiple modes:
my_mode <- function(x) {
ux <- unique(x)
tab <- tabulate(match(x, ux))
mode <- ux[tab == max(tab)]
ifelse(length(mode) > 1, sample(mode, 1), mode)
}
# single mode
cat_col_1 <- c(1, 1, 2, NA)
cat_col_1
#> [1] 1 1 2 NA
cat_col_1[is.na(cat_col_1)] <- my_mode(cat_col_1)
cat_col_1
#> [1] 1 1 2 1
# random sample among multimodal
cat_col_2 <- c(1, 1, 2, 2, NA)
cat_col_2
#> [1] 1 1 2 2 NA
cat_col_2[is.na(cat_col_2)] <- my_mode(cat_col_2)
cat_col_2
#> [1] 1 1 2 2 2
DescTools::Mode()
But other folks have written mode functions. One possibility is in the DescTools package and is named Mode().
Because it returns multiple modes in the event there are more than one, you would need to decide what to do in that event.
Here is an example to randomly sample with replacement, the necessary number of modes to replace the missing values.
# single mode
cat_col_3 <- c(1, 1, 2, NA)
cat_col_3
#> [1] 1 1 2 NA
cat_col_3_modes <- DescTools::Mode(cat_col_3, na.rm = TRUE)
cat_col_3_nmiss <- sum(is.na(cat_col_3))
cat_col_3[is.na(cat_col_3)] <- sample(cat_col_3_modes, cat_col_3_nmiss, TRUE)
cat_col_3
#> [1] 1 1 2 1
# random sample among multimodal
cat_col_4 <- c(1, 1, 2, 2, NA, NA)
cat_col_4
#> [1] 1 1 2 2 NA NA
cat_col_4_modes <- DescTools::Mode(cat_col_4, na.rm = TRUE)
cat_col_4_nmiss <- sum(is.na(cat_col_4))
cat_col_4[is.na(cat_col_4)] <- sample(cat_col_4_modes, cat_col_4_nmiss, TRUE)
cat_col_4
#> [1] 1 1 2 2 2 1
Created on 2021-04-16 by the reprex package (v1.0.0)
Nh<-matrix(c(17,26,30,17,23, 17 ,24, 23), nrow=2, ncol=4); Nh
Sh<-matrix(c(8.290133, 6.241174, 6.096808, 7.4449672, 6.894924, 7.692115,
4.540521, 7.409122), nrow=2, ncol=4); Sh
NhSh<-as.matrix(Nh*Sh); NhSh
rh<-c( 0.70710678, 0.40824829, 0.28867513, 0.22360680, 0.18257419,
0.15430335, 0.13363062, 0.11785113, 0.10540926, 0.09534626); rh
pv <- c()
for (j in 1:2) {
for (i in 1:4) {
pv <- rbind(pv, NhSh[j,i]*rh)
}
}
pv
row.names(pv) <- rep(c(1:2), each = 4)
lst<-lapply(split(seq_len(nrow(pv)), as.numeric(row.names(pv))), function(i)
pv[i,])
data<-40
nlargest <- function(x, data)
{
res <- order(x)[seq_len(data)];
pos <- arrayInd(res, dim(x), useNames = TRUE);
list(values = pv[res], position = pos)
}
out <- lapply(lst, nlargest, data = 40)
In continuation of above code Is there any brief way of repeating the following steps for each out$’k’$position for k in 1:2?
s1<-c(1,1,1,1); ch<-c(5,7,10,5); C<-150; a<-out$'1'$position
for (j in a[40:1, "row"] )
{
s1[j] <- s1[j]+1;
cost1 <- sum(ch*s1);
if (cost1>=C) break
}
s1; cost1
#Output [1] 5 6 6 5
# [1] 152
I have to get 2 values for 's' and 'cost' for out$k$position. I tried
mat = replicate (2,{x = matrix(data = rep(NA, 80), ncol = 2)}); mat
for (k in 1:2)
{
mat[,,k]<-out$'k'$position
}
mat
Error in mat[, , k] <- out$k$position :number of items to replace is not a multiple of replacement length
for (k in 1:2)
{
for (j in mat[,,k][40:1] ) {
s[j] <- s[j]+1
cost <- sum(ch*s)
if (cost>=C) break
}
}
s; cost
Error : Error in s[j] <- s[j] + 1 : NAs are not allowed in subscripted assignments
Please anyone help in resolving these errors.
We could apply the function directly by looping over the list. Note that each element of the list is a matrix
sapply(lst, is.matrix)
# 1 2
#TRUE TRUE
so, there is no need to unlist and create a matrix
out <- lapply(lst, nlargest, data = 40)
-checking with the OP's results
out1 <- nlargest(sub1, 40)
identical(out[[1]], out1)
#[1] TRUE
Update2
Based on the second update, we need to initialize 'cost' and 'sl' with the same length as 'k' elements. Here, we initialize 'sl' as a list of vectors
sl <- rep(list(c(1, 1, 1, 1)), 2)
C <- 150
cost <- numeric(2)
for (k in 1:2){
for (j in mat[,,k][40:1, 1] ) {
sl[[k]][j] <- sl[[k]][j]+1
cost[k] <- sum(ch*sl[[k]])
if (cost[k] >=C) break
}
}
sl
#[[1]]
#[1] 5 7 6 4
#[[2]]
#[1] 6 5 5 7
cost
#[1] 154 150