save an output into a vector in R? - r

I have the following function which finds the distinct number of cases belonging to 4 different factors. test is a list containing 4 dataframes
for (i in test){
i<-i%>%distinct(FileNumber)%>%nrow()
print(i)
}
When I run this, I get the following output:
[1] 38
[1] 129
[1] 1868
[1] 277
However I want this output to be saved into another vector called my_vector. So that my_vector is
38 129 1868 277
So I tried the following based on this answer I found
Saving results from for loop as a vector in r
library(dplyr)
my_vector<-vector("numeric",4L)
for (i in test){
my_vector[i]<-i%>%distinct(FileNumber)%>%nrow()
}
However when I run this I get the following message
Error in my_vector[i] <- i %>% distinct(FileNumber) %>% nrow() :
invalid subscript type 'list'
How do I get the earlier output I listed saved into a vector?

You are trying to index my_vector with a list-like object.
For instance:
# Deliberately broken example that reproduces the error from the question.
mylist <- list(mtcars, mtcars)
myvec <- numeric(length(mylist))
# `for (i in mylist)` binds `i` to each data frame in turn, not to a position.
for (i in mylist) {
# Fails here: `i` is a data frame (a list), so it is not a valid subscript.
myvec[i] <- nrow(distinct(i, cyl))
}
On the first (and second in this example) iteration, i is a frame, so myvec[i] is equivalent to myvec[mtcars], which does not make sense.
Instead, loop over the index of the list of frames, ala:
library(dplyr)
mylist <- list(mtcars, mtcars)
# Preallocate one slot per data frame in the list.
myvec <- numeric(length(mylist))
# Iterate over positions so that `i` is a valid integer subscript for both
# the list of frames and the result vector.
for (i in seq_along(mylist)) {
  # Number of distinct `cyl` values in the i-th data frame.
  myvec[i] <- mylist[[i]] %>% distinct(cyl) %>% nrow()
}
myvec
# [1] 3 3
or just do something like:
sapply(mylist, function(l) l %>% distinct(cyl) %>% nrow())
# [1] 3 3
BTW: this is just as easy in base-R with:
sapply(mylist, function(l) length(unique(l[["cyl"]])))
# [1] 3 3

This should work with a list of data frames or matrices
# Example input: a named list of matrices with different row counts.
d <- list(a = matrix(rnorm(100), nrow = 20),
          b = matrix(rnorm(100), nrow = 10),
          c = matrix(rnorm(100), nrow = 50))
# Preallocate the result instead of growing it from c(); nrow() returns an
# integer, so an integer vector keeps the result type stable even when `d`
# is empty.
my_vect <- integer(length(d))
for (i in seq_along(d)) {
  # Store the row count of the i-th element at the matching position.
  my_vect[i] <- nrow(d[[i]])
}
my_vect
[1] 20 10 50

Use unlist() and if that doesn't work, then add as.vector() in your pipe:
for (i in test){
i<-i %>% distinct(FileNumber) %>% nrow() %>% unlist()
print(i)
}
If that does not come out as a vector then:
for (i in test){
i<-i %>% distinct(FileNumber) %>% nrow() %>% unlist() %>% as.vector()
print(i)
}

Related

mean for replicate lists in R?

I have simulation and data structures as follows (just a toy example):
# Draw one observation from each of two distributions and return both as a
# named list with components `x` and `y`.
#   mu     : mean of the normal draw (standard deviation left at its default)
#   lambda : rate of the exponential draw
foo <- function(mu = 0, lambda = 1) {
  # The normal draw is made before the exponential one; keeping this order
  # preserves the random-number stream under a fixed seed.
  normal_draw <- rnorm(1, mu)          # X ~ N(mu, 1)
  exponential_draw <- rexp(1, lambda)  # Y ~ Exp(lambda)
  list(x = normal_draw, y = exponential_draw)
}
mu = 1; lambda = 2 #true values: E(X)=μ=1; E(Y)=1/λ=0.5
set.seed(0); out = replicate(1000, foo(mu,lambda), simplify=FALSE)
# str(out)
This gives a list out with length(out) = 1000, where each element is itself a list with components x and y (e.g. out[[1]]$x and out[[1]]$y).
I want to compute the means for 1000 out$xs and out$ys, respectively.
Of course, I can reach my goal through a not-clever way as
m = c() #for storing simulated values
for(i in 1:2){
s = sapply( 1:1000, function(j)out[[j]][i] )
m[i] = mean( as.numeric(s) )
}
m
# [1] 0.9736922 0.4999028
Is there a simpler and more efficient way to compute the means? I also tried lapply(out, mean)
and Reduce("+",out)/1000, but both failed...
This is another option if the sublists are always the same length:
> rowMeans(matrix(unlist(out),2))
[1] 0.9736922 0.4999028
Or:
> rowMeans(replicate(1000,unlist(foo(mu,lambda))))
x y
0.9736922 0.4999028
An option is to use purrr::transpose
library(purrr)
out %>% transpose() %>% map(~ mean(unlist(.x)[1:1000]))
# Or: out[1:1000] %>% transpose() %>% map(~ mean(unlist(.x)))
#$x
#[1] 0.9736922
#
#$y
#[1] 0.4999028
Or a base R solution using lapply (which is essentially the same as your explicit for loop):
lapply(c("x", "y"), function(var) mean(sapply(out[1:1000], "[[", var)))
#[[1]]
#[1] 0.9736922
#
#[[2]]
#[1] 0.4999028

Correctly Understanding the Use of the replicate() function

I have the following function:
library(dplyr)
var_1 <- rnorm(100, 10, 10)
var_2 <- rnorm(100, 1, 10)
var_3 <- rnorm(100, 5, 10)
response <- rnorm(100, 1, 1)
my_data <- data.frame(var_1, var_2, var_3, response)
my_data$id <- 1:100
# Run 100 sampling iterations over the global `my_data` and summarise, per
# iteration, how many ids were seen for the first time.
#
# Takes no arguments; reads `my_data` from the enclosing environment and
# relies on dplyr being attached.
#
# Returns a data.frame with one row per iteration that introduced at least
# one new id, with columns:
#   iteration_i : iteration number
#   Count       : ids whose first appearance was in that iteration
#   Cumulative  : running total of Count
#   unobserved  : 100 - Cumulative
simulate <- function() {
  n_iterations <- 100

  # One data frame per iteration: 10 rows of my_data drawn without
  # replacement, tagged with the iteration number. lapply() visits the
  # iterations in order, so the sequence of sample() calls is the same as in
  # an explicit loop.
  results <- lapply(seq_len(n_iterations), function(i) {
    iteration_i <- i
    sample_i <- my_data[sample(nrow(my_data), 10), ]
    data.frame(iteration_i, sample_i)
  })
  results_df <- do.call(rbind.data.frame, results)

  # For every id keep only the rows from the earliest iteration in which it
  # was sampled.
  test_1 <- data.frame(results_df %>%
    group_by(id) %>%
    filter(iteration_i == min(iteration_i)) %>%
    distinct())

  # Count the newly seen ids per iteration.
  summary_file <- data.frame(test_1 %>%
    group_by(iteration_i) %>%
    summarise(Count = n()))

  cumulative <- cumsum(summary_file$Count)
  summary_file$Cumulative <- cumulative
  # 100 is the number of ids in my_data as constructed in this example
  # (id <- 1:100).
  summary_file$unobserved <- 100 - cumulative
  summary_file
}
When I call this function, I get the following output:
> head(simulate())
iteration_i Count Cumulative unobserved
1 1 10 10 90
2 2 7 17 83
3 3 10 27 73
4 4 5 32 68
5 5 7 39 61
6 6 8 47 53
I want to try to run this function 10 times and append all the results into a single file. I tried to do this using the "replicate()" function - but this is not working:
# Method 1 : Did not work
n_replicates = 10
iterations_required <- replicate(n_replicates, {
simulate()
})
# Method 2: Did not work
lapply(seq_len(10), simulate(1))
# Method 3: Did Not Work
library(purrr)
rerun(10, simulate(1))
# Method 4: Did Not Work
lapply(seq_len(10), simulate)
Ideally, I would like to get something like this:
# works fine!
results <- list()
for (i in 1:10) {
game_i <- i
s_i <- simulate()
results_tmp <- data.frame(game_i, s_i)
results[[i]] <- results_tmp
}
final_file <- do.call(rbind.data.frame, results)
My Question: Is there a reason that "Method 1, Method 2, Method 3, Method 4" were not working - could someone please show me how to fix this?
# Method 1: keep every replicate as its own list element; by default
# replicate() tries to simplify the results into a matrix.
n_replicates <- 10
iterations_required <- replicate(n_replicates, {
  simulate()
}, simplify = FALSE)
# Method 2: lapply() needs a function, not the result of a call. simulate()
# takes no arguments, so the index passed by lapply() is ignored.
iterations_required <- lapply(seq_len(n_replicates), function(i) simulate())
# Method 4: passing `simulate` itself would hand it the index as an argument
# and fail with "unused argument", so it has to be wrapped as in Method 2.
iterations_required <- lapply(seq_len(n_replicates), function(i) simulate())
# to merge into one data.frame
as.data.frame(data.table::rbindlist(iterations_required, idcol = TRUE))
Alternatively, if you modify your function to simulate(i), where i becomes the first column of the output (the iteration index), then you could use do.call(rbind.data.frame, lapply(seq_len(n_replicates), simulate))
replicate by default tries to simplify the result in a matrix. So the trick is actually just not to simplify.
n_replicates<- 10
iterations_required <- replicate(n_replicates, simulate(), simplify=FALSE)

Function argument as value or column name for data.table

I want my function to be able to take a value or a column name. How can I do this with data.table?
library(data.table)
df <- data.table(a = c(1:5),
b = c(5:1),
c = c(1, 3, 5, 3, 1))
myfunc <- function(val) {
df[a >= val]
}
# This works:
myfunc(2)
# This does not work:
myfunc("c")
If I define my function as:
myfunc <- function(val) {
df[a >= get(val)]
}
# This doesn't work:
myfunc(2)
# This works:
myfunc("c")
What is the best way to resolve this?
Edit: To be clear, I want the results to be the same as:
# myfunc(2)
df %>%
filter(a >= 2)
# myfunc("c")
df %>%
filter(a >= c)
EDIT:
Thanks all for the responses, I think I like dww's answer the best.
I wish it was as easy as in dplyr, where I can do:
myfunc <- function(val) {
df %>%
filter(a >= {{val}})
}
# Both work:
myfunc(2)
myfunc(c)
If you build and parse the whole expression, then you can evaluate it in its entirety. For example
myfunc <- function(val) {
df[eval(parse(text=paste("a >= ", val)))]
}
Though relying on a function that lets you mix values and variable names in the same parameter might be dangerous. Especially in the case where you actually wanted to match on character values rather than variable names. If you passed in the whole expression you could do
myfunc <- function(expr) {
expr <- substitute(expr)
df[eval(expr)]
}
myfunc(a>=3)
myfunc(a>=c)
The question did not actually define the desired behavior so we assume that df must be a data.table and if a character string is passed then the column of that name should be returned and if a number is passed then those rows whose a column exceed that number should be returned.
Define an S3 generic and methods for character and default.
myfunc <- function(x, data = df) UseMethod("myfunc")
myfunc.character <- function(x, data = df) data[[x]]
myfunc.default <- function(x, data = df) data[a > x]
myfunc(2)
## a b c
## 1: 3 3 5
## 2: 4 2 3
## 3: 5 1 1
myfunc("c")
## [1] 1 3 5 3 1

Combining two loop into single with certain condition in R

Given the vector x below, I would like to create a vector grt1 of length 6 under two conditions. For each unique value of x, the position of grt1 with that index gets the sum computed as below. The remaining positions of grt1 (here 3, 5, and 6) are filled with draws from the standard normal distribution.
h=c(1:6)
grt1=numeric(length(h)) #Null vector
x=c(1,2,2,4,4,4)
for (i in unique(x)){
f=rep(x[x==i],3)
grt1[i]=sum(f)
} ##Condition-1
for( j in c(3,5,6))
{
grt1[j]=rnorm(1)
} ##Condition 2
The above code is working. But I want to make them a general statement by not specifying c(3,5,6) in the second condition.
Any help is appreciated.
Here is an option with split
# Per unique value of x: three times the sum of its occurrences. The names of
# v1 are the unique values themselves (as strings).
v1 <- vapply(split(x, x), function(g) sum(rep(g, 3)), numeric(1))
# Write each sum into the position given by its value; without this step v1
# would be computed but never used.
grt1[as.integer(names(v1))] <- v1
# Every position that is still 0 gets a standard-normal draw.
grt1[grt1 == 0] <- rnorm(sum(grt1 == 0))
grt1
#[1] 3.0000000 12.0000000 0.3993774 36.0000000 -0.8149975 -1.2508617
If we need to use the OP's loop, get the index of elements that are 0 (because 'grt1' is initialized as a numeric vector of 0s) with which, loop over it and assign those elements with rnorm
for(j in which(grt1 == 0)) grt1[j] <- rnorm(1)
grt1
#[1] 3.000000 12.000000 -0.765245 36.000000 -1.350172 -1.081518
NOTE: rnorm is Vectorized, so a for loop is not really needed here
Or with tidyverse
library(dplyr)
library(tidyr)
tibble(x) %>%
group_by(x) %>%
summarise(Sum = sum(rep(x, 3)), .groups = 'drop') %>%
complete(x = h) %>%
transmute(Sum = replace(Sum, is.na(Sum), rnorm(sum(is.na(Sum))))) %>%
pull(Sum)
#[1] 3.0000000 12.0000000 -1.9279778 36.0000000 0.7900143 -0.2506099
which(duplicated(x)) would give you c(3, 5, 6). Try :
inds <- which(duplicated(x))
for (i in unique(x)) {
f=rep(x[x==i],3)
grt1[i]=sum(f)
}
for( j in inds){
grt1[j]=rnorm(1)
}

R pipe operator for indexing

I want to have the same kind of pipe operator as dplyr's %>%, but for indexing instead of passing functions. For example I want to define function %l% such that for at least vectors and matrices:
1 %l% df would be equivalent to df[1] (vector) or df[1,] (matrices).
abc <- c("a","b", "c")
def <- c("d","e", "f")
df <- data.frame(abc, def, stringsAsFactors = F)
df %l% 1
You could do this, for example. Subsetting rows and columns for both vectors and matrices / data.frames.
# Row ("line") indexing operator: `x %l% y` extracts from `y` the elements or
# rows selected by the index `x`. Also available as lineindex(x, y).
#   x : index passed to `[`
#   y : object to index
# Returns y[x] when `y` has fewer than two dimensions or exactly one column,
# y[x, ] when `y` is two-dimensional, and NA otherwise.
`%l%` <- lineindex <- function(x, y) {
  # `||` is scalar and short-circuits, so ncol(y) is never evaluated for a
  # plain vector. The elementwise `|`/`&` form produced a zero-length
  # condition there and made `if` fail.
  if (length(dim(y)) < 2 || ncol(y) == 1) {
    return(y[x])
  }
  if (length(dim(y)) == 2) {
    return(y[x, ])
  }
  NA
}
# Column indexing operator: `x %c% y` extracts from `y` the elements or
# columns selected by the index `x`. Also available as colindex(x, y).
#   x : index passed to `[`
#   y : object to index
# Returns y[x] when `y` has fewer than two dimensions or exactly one column,
# y[, x] when `y` is two-dimensional, and NA otherwise.
`%c%` <- colindex <- function(x, y) {
  # `||` is scalar and short-circuits, so ncol(y) is never evaluated for a
  # plain vector. The elementwise `|`/`&` form produced a zero-length
  # condition there and made `if` fail.
  if (length(dim(y)) < 2 || ncol(y) == 1) {
    return(y[x])
  }
  if (length(dim(y)) == 2) {
    return(y[, x])
  }
  NA
}
c(1,NA,2) %>% is.na() %>% which() %l% df
abc def
2 b e
c(1,NA,2) %>% is.na() %>% which() %c% df
[1] "d" "e" "f"

Resources