R: Applying normalization function column wise - large DataFrame/DataTable - r

I have a large r data.frame with close to 500 columns. I want to add existing scale function and also try out different normalization function in a column wise fashion.
As of existing scale function
library(dplyr)
set.seed(1234)
dat <- data.frame(x = rnorm(10, 30, .2),
y = runif(10, 3, 5),
z = runif(10, 10, 20), k = runif(10, 5, 10))
dat %>% mutate_each_(funs(scale),vars=c("y","z"))
Question1:
In this case vars are only two but when you have 500 columns to normalized whats the best way?
I tried following:
dnot <- c("y", "z")
dat %>% mutate_each_(funs(scale),vars=!(names(dat) %in% dnot))
Error:
Error in UseMethod("as.lazy_dots") :
no applicable method for 'as.lazy_dots' applied to an object of class "logical"
Question2: Instead of using inbuilt scale function I want to apply my own function to normalize the data frame.
example: I have following function
normalized_columns <- function(x)
{
r <- (x/sum(x))
}
Question2: How can I efficiently apply this to all the columns while leaving out only 3 or 4 columns.

As the OP used dplyr methods, one option would be using setdiff with mutate_each_
dat %>%
mutate_each_(funs(scale), setdiff(names(dat), dnot))
# x y z k
#1 -0.8273937 3.633225 14.56091 0.22934964
#2 0.6633811 3.605387 12.65187 0.76742806
#3 1.4738069 3.318092 13.04672 -1.16688369
#4 -1.9708424 3.079992 15.07307 0.62528427
#5 0.8157183 3.437599 11.81096 -1.06313355
#6 0.8929749 4.621197 17.59671 -0.06743894
#7 -0.1923930 4.051395 12.01248 0.94484655
#8 -0.1641660 4.829316 12.58810 -0.16575678
#9 -0.1820615 4.662690 19.92150 -1.55940662
#10 -0.5090247 3.091541 18.07352 1.45571106
Or subset the names based on the logical index
dat %>%
mutate_each_(funs(scale), names(dat)[!names(dat) %in% dnot])
# x y z k
#1 -0.8273937 3.633225 14.56091 0.22934964
#2 0.6633811 3.605387 12.65187 0.76742806
#3 1.4738069 3.318092 13.04672 -1.16688369
#4 -1.9708424 3.079992 15.07307 0.62528427
#5 0.8157183 3.437599 11.81096 -1.06313355
#6 0.8929749 4.621197 17.59671 -0.06743894
#7 -0.1923930 4.051395 12.01248 0.94484655
#8 -0.1641660 4.829316 12.58810 -0.16575678
#9 -0.1820615 4.662690 19.92150 -1.55940662
#10 -0.5090247 3.091541 18.07352 1.45571106
If we are using mutate_each, another option is one_of
dat %>%
mutate_each(funs(scale), -one_of(dnot))
# x y z k
#1 -0.8273937 3.633225 14.56091 0.22934964
#2 0.6633811 3.605387 12.65187 0.76742806
#3 1.4738069 3.318092 13.04672 -1.16688369
#4 -1.9708424 3.079992 15.07307 0.62528427
#5 0.8157183 3.437599 11.81096 -1.06313355
#6 0.8929749 4.621197 17.59671 -0.06743894
#7 -0.1923930 4.051395 12.01248 0.94484655
#8 -0.1641660 4.829316 12.58810 -0.16575678
#9 -0.1820615 4.662690 19.92150 -1.55940662
#10 -0.5090247 3.091541 18.07352 1.45571106
The setdiff option with data.table would be
library(data.table)
nm1 <- setdiff(names(dat), dnot)
setDT(dat)[, (nm1) := lapply(.SD, scale), .SDcols = nm1]

There are better approaches, but I usually do something like:
set.seed(1234)
x = rnorm(10, 30, .2)
y = runif(10, 3, 5)
z = runif(10, 10, 20)
k = runif(10, 5, 10)
a = rnorm(10, 30, .2)
b = runif(10, 3, 5)
c = runif(10, 10, 20)
d = runif(10, 5, 10)
normalized_columns <- function(x)
{
x/sum(x)
}
dat<-data.frame(x,y,z,k,a,b,c,d)
dat[,c(1,4,6:8)]<-sapply(dat[,c(1,4,6:8)], normalized_columns)
Edit: as far as efficiency goes, this is pretty fast:
set.seed(100)
dat<-data.frame(matrix(rnorm(50000, 5, 2), nrow = 100, ncol = 500))
cols<-sample.int(500, 495, replace = F)
system.time(dat[,cols]<-sapply(dat[,cols], normalized_columns))
##user system elapsed
##0.03 0.00 0.03

Related

Apply function in matrix elements of a list in R

I have a list of elements in R as follows:
set.seed(123)
A <- matrix(rnorm(20 * 20, mean = 0, sd = 1), 20, 20)
B <- matrix(rnorm(20 * 20, mean = 0, sd = 1), 20, 20)
C <- matrix(rnorm(20 * 20, mean = 0, sd = 1), 20, 20)
D <- matrix(rnorm(20 * 20, mean = 0, sd = 1), 20, 20)
E <- matrix(rnorm(20 * 20, mean = 0, sd = 1), 20, 20)
DATA <- list(A,B,C,D,E)
I want for each matrix (A,B,...,E) to find the eigenvalues and combine them in a data frame as follows:
ei1 <- eigen(DATA[[1]])
ei1 <- round(ei1$values, 2)
eigenvalues1 <- as.data.frame(ei1)
ei2 <- eigen(DATA[[2]])
ei2 <- round(ei2$values, 2)
eigenvalues2 <- as.data.frame(ei2)
ei3 <- eigen(DATA[[3]])
ei3 <- round(ei3$values, 2)
eigenvalues3 <- as.data.frame(ei3)
ei4 <- eigen(DATA[[4]])
ei4 <- round(ei4$values, 2)
eigenvalues4 <- as.data.frame(ei4)
ei5 <- eigen(DATA[[5]])
ei5 <- round(ei5$values, 2)
eigenvalues5 <- as.data.frame(ei5)
eigenavules <-
cbind(eigenvalues1,eigenvalues2,eigenvalues3,eigenvalues4,eigenvalues5
)
How can I make this procedure automatically with the apply (or similar) function in instead of manually like above?
We may use lapply to loop over the list and apply the function, extract the eigen values and then do the conversion to data.frame at the end
eigenvalues <- as.data.frame(do.call(cbind,
lapply(DATA, function(x) round(eigen(x)$values, 2))))
-output
> eigenvalues
V1 V2 V3 V4 V5
1 1.77+3.73i 5.33+0.00i 5.11+0.00i -2.52+3.53i -1.87+4.42i
2 1.77-3.73i 1.72+4.13i -5.08+0.00i -2.52-3.53i -1.87-4.42i
3 -0.50+3.97i 1.72-4.13i 2.41+3.87i 2.12+3.32i 2.96+3.44i
4 -0.50-3.97i -4.02+1.85i 2.41-3.87i 2.12-3.32i 2.96-3.44i
5 -3.38+2.06i -4.02-1.85i -2.60+3.46i 3.72+0.00i -4.15+0.00i
6 -3.38-2.06i -3.27+0.00i -2.60-3.46i -3.16+0.30i 1.67+3.35i
7 3.89+0.00i 1.48+2.89i 0.10+3.78i -3.16-0.30i 1.67-3.35i
8 -2.47+3.00i 1.48-2.89i 0.10-3.78i 2.50+1.89i 3.28+1.47i
9 -2.47-3.00i 3.05+0.00i 3.74+0.00i 2.50-1.89i 3.28-1.47i
10 3.51+0.00i -0.97+2.79i 2.38+2.10i -2.69+1.46i -2.88+1.40i
11 2.04+2.29i -0.97-2.79i 2.38-2.10i -2.69-1.46i -2.88-1.40i
12 2.04-2.29i -1.86+2.07i -2.44+0.01i -1.04+2.51i -1.32+2.89i
13 -3.03+0.00i -1.86-2.07i -2.44-0.01i -1.04-2.51i -1.32-2.89i
14 -1.97+1.67i -2.18+0.00i -1.52+1.78i 0.69+2.32i -0.77+2.12i
15 -1.97-1.67i 2.14+0.00i -1.52-1.78i 0.69-2.32i -0.77-2.12i
16 0.81+1.91i 1.61+0.77i 1.93+0.86i 2.23+0.85i 1.40+1.09i
17 0.81-1.91i 1.61-0.77i 1.93-0.86i 2.23-0.85i 1.40-1.09i
18 1.02+0.00i 0.14+1.55i -0.04+1.88i -0.77+0.57i 0.65+0.35i
19 -0.57+0.47i 0.14-1.55i -0.04-1.88i -0.77-0.57i 0.65-0.35i
20 -0.57-0.47i -0.99+0.00i 0.26+0.00i 0.67+0.00i 0.58+0.00i
Not quite what you are looking for, but maybe:
df <- NULL
for (i in length(DATA)) {
di <- as.data.frame(round(eigen(DATA[[i]])$values,2))
df <- if (is.null(df)) di else cbind(df,di)
}

Replace integers in a data frame column with other integers in R?

I want to replace a vector in a dataframe that contains only 4 numbers to specific numbers as shown below
tt <- rep(c(1,2,3,4), each = 10)
df <- data.frame(tt)
I want to replace 1 = 10; 2 = 200, 3 = 458, 4 = -0.1
You could use recode from dplyr. Note that the old values are written as character. And the new values are integers since the original column was integer:
library(tidyverse):
df %>%
mutate(tt = recode(tt, '1'= 10, '2' = 200, '3' = 458, '4' = -0.1))
tt
1 10.0
2 10.0
3 200.0
4 200.0
5 458.0
6 458.0
7 -0.1
8 -0.1
To correct the error in the code in the question and provide for a shorter example we use the input in the Note at the end. Here are several alternatives. nos defined in (1) is used in some of the others too. No packages are used.
1) indexing To get the result since the input is 1 to 4 we can use indexing. This is probably the simplest solution given that the original values of tt are in 1:4.
nos <- c(10, 200, 458, -0.1)
transform(df, tt = nos[tt])
## tt
## 1 10.0
## 2 10.0
## 3 200.0
## 4 200.0
## 5 458.0
## 6 458.0
## 7 -0.1
## 8 -0.1
1a) If the input is not necessarily in 1:4 then we could use this generalization
transform(df, tt = nos[match(tt, 1:4)])
2) arithmetic Another approach is to use arithmetic:
transform(df, tt = 10 * (tt == 1) +
200 * (tt == 2) +
458 * (tt == 3) +
-0.1 * (tt == 4))
3) outer/matrix multiplication This would also work:
transform(df, tt = c(outer(tt, 1:4, `==`) %*% nos))
3a) This is the same except we use model.matrix instead of outer.
transform(df, tt = c(model.matrix(~ factor(tt) + 0, df) %*% nos))
4) factor The levels of the factor are 1:4 and the corresponding labels are defined by nos. Extract the labels using format and then convert them to numeric.
transform(df, tt = as.numeric(format(factor(tt, levels = 1:4, labels = nos))))
4a) or as a pipeline
transform(df, tt = tt |>
factor(levels = 1:4, labels = nos) |>
format() |>
as.numeric())
5) loop We can use a simple loop. Nulling out i at the end is so that it is not made into a column.
within(df, { for(i in 1:4) tt[tt == i] <- nos[i]; i <- NULL })
6) Reduce This is somewhat similar to (5) but implements the loop using Reduce.
fun <- function(tt, i) replace(tt, tt == i, nos[i])
transform(df, tt = Reduce(fun, init = tt, 1:4))
Note
df <- data.frame(tt = c(1, 1, 2, 2, 3, 3, 4, 4))

Thousand separator to numeric columns in R

I am trying to format numbers as shown (adding thousand separator). The function is working fine but post formatting the numbers, the numeric columns does not sort by numbers since there are characters
df <- data.frame(x = c(12345,35666,345,5646575))
format_numbers <- function (df, column_name){
df[[column_name]] <- ifelse(nchar(df[[column_name]]) <= 5, paste(format(round(df[[column_name]] / 1e3, 1), trim = TRUE), "K"),
paste(format(round(df[[column_name]] / 1e6, 1), trim = TRUE), "M"))
}
df$x <- format_numbers(df,"x")
> df
x
1 12.3 K
2 35.7 K
3 0.3 K
4 5.6 M
Can we make sure the numbers are sorted in descending/ascending order post formatting ?
Note : This data df is to be incorporated in DT table
The problem is the formating part. If you do it correctly--ie while maintaining your data as numeric, then everything else will fall in place. Here I will demonstrate using S3 class:
my_numbers <- function(x) structure(x, class = c('my_numbers', 'numeric'))
format.my_numbers <- function(x,..., d = 1, L = c('', 'K', 'M', 'B', 'T')){
ifelse(abs(x) >= 1000, Recall(x/1000, d = d + 1),
sprintf('%.1f%s', x, L[d]))
}
print.my_numbers <- function(x, ...) print(format(x), quote = FALSE)
'[.my_numbers' <- function(x, ..., drop = FALSE) my_numbers(NextMethod('['))
Now you can run your code:
df <- data.frame(x = c(12345,35666,345,5646575))
df$x <- my_numbers(df$x)
df
x
1 12.3K
2 35.7K
3 345.0
4 5.6M
You can use any mathematical operation on column x as it is numeric.
eg:
cbinding with its double and ordering from smallest to larges:
cbind(x = df, y = df*2)[order(df$x),]
x x
3 345.0 690.0 # smallest
1 12.3K 24.7K
2 35.7K 71.3K
4 5.6M 11.3M # largest ie Millions
Note that under the hood, x does not change:
unclass(df$x)
[1] 12345 35666 345 5646575 # Same as given

Summarize matrix of boolean variables

I have a bunch of Boolean variables. I want to summarize them and show the percentage of positive values. The big thing in this question is that the variables are logical organized in two dimensions.
The result I want should look like this (kind of):
a b
v1_1 30% 60%
v1_2 60% 50%
Here is a minimal working (self running) example.
#!/usr/bin/env Rscript
set.seed(0)
df <- data.frame(v1_1_a = sample(c(T,F), 10, replace=TRUE),
v1_1_b = sample(c(T,F), 10, replace=TRUE),
v1_2_a = sample(c(T,F), 10, replace=TRUE),
v1_2_b = sample(c(T,F), 10, replace=TRUE))
my_percent <- function (col) { return (100 / length(col) * sum(col)) }
p <- apply(df, 2, my_percent)
print(p)
This is the output:
v1_1_a v1_1_b v1_2_a v1_2_b
30 60 60 50
Just for information: The real data has 80 Boolean variables logical organized in a 10 x 8 matrix.
If you don't mind having to add in the row and column names, you could use colMeans together with the matrix construction function to build a matrix with the desired structure.
myMat <- matrix(colMeans(df), 2, byrow = TRUE)
MyMat
[,1] [,2]
[1,] 0.3 0.6
[2,] 0.6 0.5
If desired, you could add the names using dimnames. In this instance,
dimnames(myMat) <- list(paste0("V1", 1:2), letters[1:2])
will do the trick.
You could break up the metric names into separate columns.
With dplyr and tidyr:
p <- data.frame(p)
p$metric <- row.names(p)
p %>% mutate(metric_1 = ifelse(grepl('v1_1_', metric), "v1_1", "v1_2"),
metric_2 = ifelse(grepl('a', metric), 'a', 'b')) %>%
select(-metric) %>%
spread(key = metric_2, value = p)
Giving...
metric_1 a b
1 v1_1 30 60
2 v1_2 60 50
You could set the row names to get exactly what you want:
row.names(p) <- p$metric_1
p %<>% select(-metric_1)
Resulting in...
a b
v1_1 30 60
v1_2 60 50

subset data.frame union multiple intervals

Consider the following data.frame,
d <- data.frame(x = seq(0, 10, length=100), value = rnorm(100))
I wish to subset based on x belonging to any of the following intervals,
intervals <- list(c(0.2, 0.8), c(1, 2), c(8, 8.2))
test <- function(range, x){
which(x >= range[1] & x <= range[2])
}
d[Reduce(`union`, lapply(intervals, test, x=d$x)), ]
Now, the testing function seems redundant, as it looks an awful lot like the built-in findInterval, but I cannot find an elegant way to use it.
condition <- Reduce(`|`, lapply(lapply(intervals, findInterval,
x=d$x, all.inside=FALSE), `==`, 1))
d[condition, ]
Can you suggest better?
d[unlist(sapply(intervals, function(x) which(!is.na(cut(d$x,x))))),]
x value
3 0.2020202 0.15488314
4 0.3030303 -0.06891842
5 0.4040404 1.59909655
6 0.5050505 0.31006866
7 0.6060606 1.68986821
8 0.7070707 0.18500635
11 1.0101010 0.18721091
12 1.1111111 0.32485063
13 1.2121212 -0.42728405
14 1.3131313 0.84220081
15 1.4141414 -1.30745237
16 1.5151515 -1.90335389
17 1.6161616 -0.47139683
18 1.7171717 0.01622827
19 1.8181818 0.76362918
20 1.9191919 -0.37827765
81 8.0808081 0.46672521
82 8.1818182 1.27038641
Edit: The same result using findInterval
d[findInterval(d$x,unlist(intervals))%%2==1,]
Here is a solution with the intervals package.
d <- data.frame(x = seq(0, 10, length=100), value = rnorm(100))
intervals <- list(c(0.2, 0.8), c(1, 2), c(8, 8.2))
library(intervals)
intervals <- Intervals( do.call( rbind, intervals ) )
intervals <- reduce( intervals ) # Simplify, if they overlap
condition <- distance_to_nearest(d$x, intervals) == 0
# The following would allow for non-closed intervals,
# but it is awfully slow.
condition <- sapply( d$x, function(u)
any(!empty(interval_intersection( Intervals(c(u,u)), intervals ))))
d[condition,]
With findInterval, it may be trickier,
because it assumes the intervals are closed on one side and open on the other.
If this is acceptable, if the intervals are ordered and do not overlap,
you just need to check if the interval number is odd.
intervals <- list(c(0.2, 0.8), c(1, 2), c(8, 8.2))
condition <- findInterval( d$x, unlist(intervals) ) %% 2 == 1
d[condition,]

Resources