Apply function in matrix elements of a list in R - r

I have a list of elements in R as follows:
set.seed(123)
A <- matrix(rnorm(20 * 20, mean = 0, sd = 1), 20, 20)
B <- matrix(rnorm(20 * 20, mean = 0, sd = 1), 20, 20)
C <- matrix(rnorm(20 * 20, mean = 0, sd = 1), 20, 20)
D <- matrix(rnorm(20 * 20, mean = 0, sd = 1), 20, 20)
E <- matrix(rnorm(20 * 20, mean = 0, sd = 1), 20, 20)
DATA <- list(A,B,C,D,E)
I want for each matrix (A,B,...,E) to find the eigenvalues and combine them in a data frame as follows:
ei1 <- eigen(DATA[[1]])
ei1 <- round(ei1$values, 2)
eigenvalues1 <- as.data.frame(ei1)
ei2 <- eigen(DATA[[2]])
ei2 <- round(ei2$values, 2)
eigenvalues2 <- as.data.frame(ei2)
ei3 <- eigen(DATA[[3]])
ei3 <- round(ei3$values, 2)
eigenvalues3 <- as.data.frame(ei3)
ei4 <- eigen(DATA[[4]])
ei4 <- round(ei4$values, 2)
eigenvalues4 <- as.data.frame(ei4)
ei5 <- eigen(DATA[[5]])
ei5 <- round(ei5$values, 2)
eigenvalues5 <- as.data.frame(ei5)
eigenavules <-
cbind(eigenvalues1,eigenvalues2,eigenvalues3,eigenvalues4,eigenvalues5
)
How can I make this procedure automatically with the apply (or similar) function in instead of manually like above?

We may use lapply to loop over the list and apply the function, extract the eigen values and then do the conversion to data.frame at the end
eigenvalues <- as.data.frame(do.call(cbind,
lapply(DATA, function(x) round(eigen(x)$values, 2))))
-output
> eigenvalues
V1 V2 V3 V4 V5
1 1.77+3.73i 5.33+0.00i 5.11+0.00i -2.52+3.53i -1.87+4.42i
2 1.77-3.73i 1.72+4.13i -5.08+0.00i -2.52-3.53i -1.87-4.42i
3 -0.50+3.97i 1.72-4.13i 2.41+3.87i 2.12+3.32i 2.96+3.44i
4 -0.50-3.97i -4.02+1.85i 2.41-3.87i 2.12-3.32i 2.96-3.44i
5 -3.38+2.06i -4.02-1.85i -2.60+3.46i 3.72+0.00i -4.15+0.00i
6 -3.38-2.06i -3.27+0.00i -2.60-3.46i -3.16+0.30i 1.67+3.35i
7 3.89+0.00i 1.48+2.89i 0.10+3.78i -3.16-0.30i 1.67-3.35i
8 -2.47+3.00i 1.48-2.89i 0.10-3.78i 2.50+1.89i 3.28+1.47i
9 -2.47-3.00i 3.05+0.00i 3.74+0.00i 2.50-1.89i 3.28-1.47i
10 3.51+0.00i -0.97+2.79i 2.38+2.10i -2.69+1.46i -2.88+1.40i
11 2.04+2.29i -0.97-2.79i 2.38-2.10i -2.69-1.46i -2.88-1.40i
12 2.04-2.29i -1.86+2.07i -2.44+0.01i -1.04+2.51i -1.32+2.89i
13 -3.03+0.00i -1.86-2.07i -2.44-0.01i -1.04-2.51i -1.32-2.89i
14 -1.97+1.67i -2.18+0.00i -1.52+1.78i 0.69+2.32i -0.77+2.12i
15 -1.97-1.67i 2.14+0.00i -1.52-1.78i 0.69-2.32i -0.77-2.12i
16 0.81+1.91i 1.61+0.77i 1.93+0.86i 2.23+0.85i 1.40+1.09i
17 0.81-1.91i 1.61-0.77i 1.93-0.86i 2.23-0.85i 1.40-1.09i
18 1.02+0.00i 0.14+1.55i -0.04+1.88i -0.77+0.57i 0.65+0.35i
19 -0.57+0.47i 0.14-1.55i -0.04-1.88i -0.77-0.57i 0.65-0.35i
20 -0.57-0.47i -0.99+0.00i 0.26+0.00i 0.67+0.00i 0.58+0.00i

Not quite what you are looking for, but maybe:
df <- NULL
for (i in length(DATA)) {
di <- as.data.frame(round(eigen(DATA[[i]])$values,2))
df <- if (is.null(df)) di else cbind(df,di)
}

Related

Replace integers in a data frame column with other integers in R?

I want to replace a vector in a dataframe that contains only 4 numbers to specific numbers as shown below
tt <- rep(c(1,2,3,4), each = 10)
df <- data.frame(tt)
I want to replace 1 = 10; 2 = 200, 3 = 458, 4 = -0.1
You could use recode from dplyr. Note that the old values are written as character. And the new values are integers since the original column was integer:
library(tidyverse):
df %>%
mutate(tt = recode(tt, '1'= 10, '2' = 200, '3' = 458, '4' = -0.1))
tt
1 10.0
2 10.0
3 200.0
4 200.0
5 458.0
6 458.0
7 -0.1
8 -0.1
To correct the error in the code in the question and provide for a shorter example we use the input in the Note at the end. Here are several alternatives. nos defined in (1) is used in some of the others too. No packages are used.
1) indexing To get the result since the input is 1 to 4 we can use indexing. This is probably the simplest solution given that the original values of tt are in 1:4.
nos <- c(10, 200, 458, -0.1)
transform(df, tt = nos[tt])
## tt
## 1 10.0
## 2 10.0
## 3 200.0
## 4 200.0
## 5 458.0
## 6 458.0
## 7 -0.1
## 8 -0.1
1a) If the input is not necessarily in 1:4 then we could use this generalization
transform(df, tt = nos[match(tt, 1:4)])
2) arithmetic Another approach is to use arithmetic:
transform(df, tt = 10 * (tt == 1) +
200 * (tt == 2) +
458 * (tt == 3) +
-0.1 * (tt == 4))
3) outer/matrix multiplication This would also work:
transform(df, tt = c(outer(tt, 1:4, `==`) %*% nos))
3a) This is the same except we use model.matrix instead of outer.
transform(df, tt = c(model.matrix(~ factor(tt) + 0, df) %*% nos))
4) factor The levels of the factor are 1:4 and the corresponding labels are defined by nos. Extract the labels using format and then convert them to numeric.
transform(df, tt = as.numeric(format(factor(tt, levels = 1:4, labels = nos))))
4a) or as a pipeline
transform(df, tt = tt |>
factor(levels = 1:4, labels = nos) |>
format() |>
as.numeric())
5) loop We can use a simple loop. Nulling out i at the end is so that it is not made into a column.
within(df, { for(i in 1:4) tt[tt == i] <- nos[i]; i <- NULL })
6) Reduce This is somewhat similar to (5) but implements the loop using Reduce.
fun <- function(tt, i) replace(tt, tt == i, nos[i])
transform(df, tt = Reduce(fun, init = tt, 1:4))
Note
df <- data.frame(tt = c(1, 1, 2, 2, 3, 3, 4, 4))

Error in while (e_i$X1 < 12 | e_i$X2 < 12) { : argument is of length zero

In an earlier question (R: Logical Conditions Not Being Respected), I learned how to make the following simulation :
Step 1: Keep generating two random numbers "a" and "b" until both "a" and "b" are greater than 12
Step 2: Track how many random numbers had to be generated until it took for Step 1 to be completed
Step 3: Repeat Step 1 and Step 2 100 times
res <- matrix(0, nrow = 0, ncol = 3)
for (j in 1:100){
a <- rnorm(1, 10, 1)
b <- rnorm(1, 10, 1)
i <- 1
while(a < 12 | b < 12) {
a <- rnorm(1, 10, 1)
b <- rnorm(1, 10, 1)
i <- i + 1
}
x <- c(a,b,i)
res <- rbind(res, x)
}
head(res)
[,1] [,2] [,3]
x 12.14232 12.08977 399
x 12.27158 12.01319 1695
x 12.57345 12.42135 302
x 12.07494 12.64841 600
x 12.03210 12.07949 82
x 12.34006 12.00365 782
Question: Now, I am trying to make a slight modification to the above code - Instead of "a" and "b" being produced separately, I want them to be produced "together" (in math terms: "a" and "b" were being produced from two independent univariate normal distributions, now I want them to come from a bivariate normal distribution).
I tried to modify this code myself:
library(MASS)
Sigma = matrix(
c(1,0.5, 0.5, 1), # the data elements
nrow=2, # number of rows
ncol=2, # number of columns
byrow = TRUE) # fill matrix by rows
res <- matrix(0, nrow = 0, ncol = 3)
for (j in 1:100){
e_i = data.frame(mvrnorm(n = 1, c(10,10), Sigma))
e_i$i <- 1
while(e_i$X1 < 12 | e_i$X2 < 12) {
e_i = data.frame(mvrnorm(n = 1, c(10,10), Sigma))
e_i$i <- i + 1
}
x <- c(e_i$X1, e_i$X2 ,i)
res <- rbind(res, x)
}
res = data.frame(res)
But this is producing the following error:
Error in while (e_i$X1 < 12 | e_i$X2 < 12) { : argument is of length
zero
If I understand your code correctly you are trying to see how many samples occur before both values are >=12 and doing that for 100 trials? This is the approach I would take:
library(MASS)
for(i in 1:100){
n <- 1
while(any((x <- mvrnorm(1, mu=c(10,10), Sigma=diag(0.5, nrow=2)+0.5))<12)) n <- n+1
if(i==1) res <- data.frame("a"=x[1], "b"=x[2], n)
else res <- rbind(res, data.frame("a"=x[1], "b"=x[2], n))
}
Here I am assigning the results of a mvrnorm to x within the while() call. In that same call, it evaluates whether either are less than 12 using the any() function. If that evaluates to FALSE, n (the counter) is increased and the process repeated. Once TRUE, the values are appended to your data.frame and it goes back to the start of the for-loop.
Regarding your code, the mvrnorm() function is returning a vector, not a matrix, when n=1 so both values go into a single variable in the data.frame:
data.frame(mvrnorm(n = 1, c(10,10), Sigma))
Returns:
mvrnorm.n...1..c.10..10...Sigma.
1 9.148089
2 10.605546
The matrix() function within your data.frame() calls, along with some tweaks to your use of i, will fix your code:
library(MASS)
Sigma = matrix(
c(1,0.5, 0.5, 1), # the data elements
nrow=2, # number of rows
ncol=2, # number of columns
byrow = TRUE) # fill matrix by rows
res <- matrix(0, nrow = 0, ncol = 3)
for (j in 1:10){
e_i = data.frame(matrix(mvrnorm(n = 1, c(10,10), Sigma), ncol=2))
i <- 1
while(e_i$X1[1] < 12 | e_i$X2[1] < 12) {
e_i = data.frame(matrix(mvrnorm(n = 1, c(10,10), Sigma), ncol=2))
i <- i + 1
}
x <- c(e_i$X1, e_i$X2 ,i)
res <- rbind(res, x)
}
res = data.frame(res)

What solves my problem: Map, reduce or a recursion?

I really need some help to write a recursion in R.
The function that I want changes a certain observation according to a set of comparisons between different rows in a data frame, which I shall call g. One of these comparisons depends on the previous value of this same observation.
Suppose first that I want to update the value of column index, row i in my data df in the following way:
j <- 1:4
g <- (df$dom[i] > 0 &
abs(df$V2009[i] - df$V2009[j]) <= w) |
df$index[i] == df$index[j]
df$index[i] <- ifelse(any(g), which(g)[[1]], df$index[[i]])
The thing is, the object w is actually a list:
w = list(0, 1, 2, df$age[i])
So, as you can see, I want to create a function foo() that updates df$index iteratively. It changes it by looping through w and comparisons depend on updated values.
Here is some data:
df <- data.frame(dom = c(0, 0, 6, 6),
V2009 = c(9, 11, 9, 11),
index = c(1, 2, 1, 2),
age = c(2, 2, 2, 2))
I am not sure if a recursive function is actually needed or if something like reduce or map would do it.
Thank you!
The following function uses a double for loop to change the values of column index according to the condition defining g. It accepts a data.frame as input and returns the updated data.frame.
foo <- function(x){
change_index <- function(x, i, w){
j <- seq_len(nrow(x))
(x$dom[i] > 0 & abs(x$V2009[i] - x$V2009[j]) <= w) |
x$index[i] == x$index[j]
}
for(i in seq_len(nrow(x))){
W <- list(0, 1, 2, x$age[i])
for(w in W){
g <- change_index(x, i, w)
if(any(g)) x$index[i] <- which(g)[1]
}
}
x
}
foo(df)
# dom V2009 index age
#1 0 9 1 2
#2 0 11 2 2
#3 6 9 1 2
#4 6 11 1 2
One can define w inside a function and use lexical scoping (closure).
Using your instructions, the function index_value calculates for any given i the index value.
correct_index_col returns the corrected df.
df <- data.frame(dom = c(0, 0, 6, 6),
V2009 = c(9, 11, 9, 11),
index = c(1, 2, 1, 2),
age = c(2, 2, 2, 2))
index_value <- function(df, i) {
j <- nrow(df)
w <- c(0, 1, 2, df$age[i])
g <- (df$dom[i] > 0 & abs(df$V2009[i] - df$V2009[j]) <= w) |
df$index[i] == df$index[j]
ifelse(any(g), which(g)[[1]], df$index[[i]])
}
correct_index_col <- function(df) {
indexes <- Vectorize(function(i) {
index_value(df, i)
})
df$index <- indexes(1:nrow(df))
df
}
# > correct_index_col(df)
# dom V2009 index age
# 1 0 9 1 2
# 2 0 11 1 2
# 3 6 9 3 2
# 4 6 11 1 2
#
If you want to really update (mutate) your df, then you have to do
df <- correct_index_col(df).
Here is an attempt of my own. I guess I figured out a way to use recursion over mutate:
test <- function(i, df, k){
j <- 1:nrow(df)
w <- list(0, 1, 2, df$age[i])
g <- (df$dom[i] > 0 & abs(df$V2009[i] - df$V2009[j]) <= w[k]) |
df$index[i] == df$index[j]
l <- ifelse(any(g), which(g)[1], df$index[i])
return(l)
}
loop <- function(data,
k = 1) {
data <- data %>%
mutate(index = map_dbl(seq(n()),
~ test(.x, df = cur_data(), k)))
if (k == 4) {
return(data)
} else {
return(loop(data, k + 1))
}
}
df %>% loop()
I welcome any comments in case this is inefficient considering large datasets

R: Applying normalization function column wise - large DataFrame/DataTable

I have a large r data.frame with close to 500 columns. I want to add existing scale function and also try out different normalization function in a column wise fashion.
As of existing scale function
library(dplyr)
set.seed(1234)
dat <- data.frame(x = rnorm(10, 30, .2),
y = runif(10, 3, 5),
z = runif(10, 10, 20), k = runif(10, 5, 10))
dat %>% mutate_each_(funs(scale),vars=c("y","z"))
Question1:
In this case vars are only two but when you have 500 columns to normalized whats the best way?
I tried following:
dnot <- c("y", "z")
dat %>% mutate_each_(funs(scale),vars=!(names(dat) %in% dnot))
Error:
Error in UseMethod("as.lazy_dots") :
no applicable method for 'as.lazy_dots' applied to an object of class "logical"
Question2: Instead of using inbuilt scale function I want to apply my own function to normalize the data frame.
example: I have following function
normalized_columns <- function(x)
{
r <- (x/sum(x))
}
Question2: How can I efficiently apply this to all the columns while leaving out only 3 or 4 columns.
As the OP used dplyr methods, one option would be using setdiff with mutate_each_
dat %>%
mutate_each_(funs(scale), setdiff(names(dat), dnot))
# x y z k
#1 -0.8273937 3.633225 14.56091 0.22934964
#2 0.6633811 3.605387 12.65187 0.76742806
#3 1.4738069 3.318092 13.04672 -1.16688369
#4 -1.9708424 3.079992 15.07307 0.62528427
#5 0.8157183 3.437599 11.81096 -1.06313355
#6 0.8929749 4.621197 17.59671 -0.06743894
#7 -0.1923930 4.051395 12.01248 0.94484655
#8 -0.1641660 4.829316 12.58810 -0.16575678
#9 -0.1820615 4.662690 19.92150 -1.55940662
#10 -0.5090247 3.091541 18.07352 1.45571106
Or subset the names based on the logical index
dat %>%
mutate_each_(funs(scale), names(dat)[!names(dat) %in% dnot])
# x y z k
#1 -0.8273937 3.633225 14.56091 0.22934964
#2 0.6633811 3.605387 12.65187 0.76742806
#3 1.4738069 3.318092 13.04672 -1.16688369
#4 -1.9708424 3.079992 15.07307 0.62528427
#5 0.8157183 3.437599 11.81096 -1.06313355
#6 0.8929749 4.621197 17.59671 -0.06743894
#7 -0.1923930 4.051395 12.01248 0.94484655
#8 -0.1641660 4.829316 12.58810 -0.16575678
#9 -0.1820615 4.662690 19.92150 -1.55940662
#10 -0.5090247 3.091541 18.07352 1.45571106
If we are using mutate_each, another option is one_of
dat %>%
mutate_each(funs(scale), -one_of(dnot))
# x y z k
#1 -0.8273937 3.633225 14.56091 0.22934964
#2 0.6633811 3.605387 12.65187 0.76742806
#3 1.4738069 3.318092 13.04672 -1.16688369
#4 -1.9708424 3.079992 15.07307 0.62528427
#5 0.8157183 3.437599 11.81096 -1.06313355
#6 0.8929749 4.621197 17.59671 -0.06743894
#7 -0.1923930 4.051395 12.01248 0.94484655
#8 -0.1641660 4.829316 12.58810 -0.16575678
#9 -0.1820615 4.662690 19.92150 -1.55940662
#10 -0.5090247 3.091541 18.07352 1.45571106
The setdiff option with data.table would be
library(data.table)
nm1 <- setdiff(names(dat), dnot)
setDT(dat)[, (nm1) := lapply(.SD, scale), .SDcols = nm1]
There are better approaches, but I usually do something like:
set.seed(1234)
x = rnorm(10, 30, .2)
y = runif(10, 3, 5)
z = runif(10, 10, 20)
k = runif(10, 5, 10)
a = rnorm(10, 30, .2)
b = runif(10, 3, 5)
c = runif(10, 10, 20)
d = runif(10, 5, 10)
normalized_columns <- function(x)
{
x/sum(x)
}
dat<-data.frame(x,y,z,k,a,b,c,d)
dat[,c(1,4,6:8)]<-sapply(dat[,c(1,4,6:8)], normalized_columns)
Edit: as far as efficiency goes, this is pretty fast:
set.seed(100)
dat<-data.frame(matrix(rnorm(50000, 5, 2), nrow = 100, ncol = 500))
cols<-sample.int(500, 495, replace = F)
system.time(dat[,cols]<-sapply(dat[,cols], normalized_columns))
##user system elapsed
##0.03 0.00 0.03

Loop through data frames based upon name

I have another simple r question that hopefully someone can help with. I have a series of dataframes that have a repetitive name structure. I would like to loop through them and perform some analysis. Here is hardcoded example of what I want to do using some fake data:
#Create some fake data
n1 = c(2, 3, 5, 7)
s1 = c(1, 1, 2, 0)
b1 = c(6, 0, 0, 0)
Tank001.df = data.frame(n1, s1, b1)
n2 = c(1, 2, 4, 6)
s2 = c(2, 2, 0, 0)
b2 = c(8, 9, 10, 0)
Tank002.df = data.frame(n2, s2, b2)
n3 = c(7, 12, 0, 0)
s3 = c(5, 3, 0, 0)
b3 = c(8, 9, 10, 4)
Tank003.df = data.frame(n3, s3, b3)
The first action I would like to automate is the conversion of 0 values to "NA". Here is the harcoded version but I would ideally automate this dependant on how many Tankxxx.df dataframes I have:
#Convert zeros to NA
Tank001.df[Tank001.df==0] <- NA
Tank002.df[Tank002.df==0] <- NA
Tank003.df[Tank003.df==0] <- NA
Finally I would like to complete a series of queries of the data, a simple example of which might be the number of values smaller than 5 in each dataframe:
#Return the number of values smaller than 5
Tank001.less.than.5 <- numeric(length(Tank001.df))
for (i in 1:(length(Tank001.df))) {Tank001.less.than.5[i] <- sum(Tank001.df[[i]] < 5,na.rm=TRUE)}
Tank002.less.than.5 <- numeric(length(Tank002.df))
for (i in 1:(length(Tank002.df))) {Tank002.less.than.5[i] <- sum(Tank002.df[[i]] < 5,na.rm=TRUE)}
Tank003.less.than.5 <- numeric(length(Tank003.df))
for (i in 1:(length(Tank003.df))) {Tank003.less.than.5[i] <- sum(Tank003.df[[i]] < 5,na.rm=TRUE)}
Ideally I would also like to know how to write the results of such simple calculations to a new dataframe. In this case for example Less.than.5$TankXXX etc.
Any help would be greatly appreciated.
Create a list of your data.frames and use a combination of lapply and sapply as follows:
TankList <- list(Tank001.df, Tank002.df, Tank003.df)
lapply(TankList, function(x) {
x[x == 0] <- NA
sapply(x, function(y) sum(y < 5, na.rm = TRUE))
})
# [[1]]
# n1 s1 b1
# 2 3 0
#
# [[2]]
# n2 s2 b2
# 3 2 0
#
# [[3]]
# n3 s3 b3
# 0 1 1
This also works with a single lapply and colSums:
l <- list(Tank001.df, Tank002.df, Tank003.df) # create a list
lapply(l, function(x) colSums("is.na<-"(x, !x) < 5, na.rm = TRUE))
# [[1]]
# n1 s1 b1
# 2 3 0
#
# [[2]]
# n2 s2 b2
# 3 2 0
#
# [[3]]
# n3 s3 b3
# 0 1 1

Resources