Modify multiple columns of a dataset by the same function in R

Here is some code to generate a data.frame:
ref_variables <- LETTERS[1:10]
row <- 100
d0 <- 1:row
for (i in seq_along(ref_variables)) {
  dtemp <- sample(1:row, row, replace = TRUE)
  d0 <- data.frame(d0, dtemp)
}
d0[, 1] <- NULL
names(d0) <- ref_variables
I have a dataset, data.frame or data.table, whatever.
Let's say I want to modify some of the columns by dividing each of them by the first one. Of course, I can write a loop like this:
columns_name_to_divide <- c("B", "C", "H")
column_divisor <- "A"
for (i in seq_along(columns_name_to_divide)) {
  d0[columns_name_to_divide[i]] <- d0[columns_name_to_divide[i]] / d0[column_divisor]
}
But is there a more elegant way to do it?

> d0[2:4] <- d0[,2:4]/d0[,1]
This substitutes the original values in columns 2, 3 and 4 with the results of dividing them by column 1. The rest remains the same.
If instead you want to keep the originals and put the results in three new columns, assign to new column positions: the calculated values then land in columns 11, 12 and 13, and columns 2, 3 and 4 are left untouched.
> dim(d0)
# [1] 100 10
> d0[11:13] <- d0[,2:4]/d0[,1]
> dim(d0)
# [1] 100 13
To round the new values, simply wrap the division in round(), here to 2 decimal places:
> d0[2:4] <- round(d0[,2:4]/d0[,1], 2)   # Original values in columns 2, 3, 4 substituted
# OR
> d0[11:13] <- round(d0[,2:4]/d0[,1], 2) # New columns added; original columns untouched

We can use set from data.table, which would make this more efficient, as the overhead of [.data.table is avoided when called multiple times (though not in this case).
library(data.table)
setDT(d0)
for (j in columns_name_to_divide) {
  set(d0, i = NULL, j = j, value = d0[[j]] / d0[[column_divisor]])
}
Or using lapply
setDT(d0)[, (columns_name_to_divide) := lapply(.SD, `/`,
          d0[[column_divisor]]), .SDcols = columns_name_to_divide]
Or an elegant option using dplyr
library(dplyr)
library(magrittr)
d0 %<>%
  mutate_each_(funs(. / d0[[column_divisor]]), columns_name_to_divide)
head(d0)
# A B C D E F G H I J
#1 60 0.4000000 1.1500000 6 86 27 19 0.150000 94 97
#2 11 0.6363636 0.3636364 25 52 44 82 8.818182 84 68
#3 80 0.8750000 1.1375000 72 34 56 69 0.125000 34 17
#4 77 0.3116883 1.0259740 9 44 87 61 1.064935 79 40
#5 18 0.3333333 5.0555556 60 69 62 89 2.166667 21 34
#6 42 1.3333333 2.3095238 61 20 87 95 1.428571 78 63
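Note that mutate_each_() and funs() have since been deprecated in dplyr. A minimal sketch of the same operation with current dplyr (assuming version >= 1.0, where across() is available):
library(dplyr)
# Divide each target column by the divisor column, modifying d0 in place
d0 <- d0 %>%
  mutate(across(all_of(columns_name_to_divide), ~ .x / .data[[column_divisor]]))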
Benchmarks
set.seed(42)
d1 <- as.data.frame(matrix(sample(1:9, 1e7*7, replace=TRUE), ncol=7))
d2 <- copy(d1)
d3 <- copy(d1)
system.time({
  d2 %<>%
    mutate_each(funs(./d2[["V2"]]), V4:V7)
})
#  user  system elapsed
#  0.52    0.39    0.91
system.time({
  d1[,4:7] <- d1[,4:7]/d1[,2]
})
#  user  system elapsed
#  1.72    0.72    2.44
system.time({
  setDT(d3)
  for (j in 4:7) {
    set(d3, i = NULL, j = j, value = d3[[j]]/d3[["V2"]])
  }
})
#  user  system elapsed
#  0.32    0.16    0.47
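As an aside, system.time() on a single run can be noisy; a sketch using the microbenchmark package (not used in the original post, so treat this as an assumption) to repeat just the division:
library(microbenchmark)
# Time only the division (no assignment), repeated 10 times
microbenchmark(base = d1[, 4:7] / d1[, 2], times = 10)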

You can do this:
library(data.table)
cols <- names(df)[2:4]
col1 <- names(df)[1]
setDT(df)[, (cols) := lapply(cols, function(x) get(x) / get(col1))]
# sample data for a reproducible example (define df first, then run the lines above):
df <- data.frame(V1 = rep(10, 5),
                 V2 = rep(20, 5),
                 V3 = rep(30, 5),
                 V4 = rep(40, 5),
                 V5 = rep(50, 5))
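Run against this sample data (define df, cols and col1 first, then run the setDT() line), each of V2:V4 is divided by V1 in place. A quick sanity check, with the expected result shown as comments:
head(df, 2)
#    V1 V2 V3 V4 V5
# 1: 10  2  3  4 50
# 2: 10  2  3  4 50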

Related

R data.table divide set of columns and flag using any

I am working on a data set which is large and has many columns. I am using data.table to speed up the calculations; however, at certain points I am not sure how to proceed in data.table, so I convert back to data.frame and do the calculation there, which slows up the process. It would help a lot to have suggestions on how I can write the steps below in data.table. Below is a snippet of my code on dummy data:
library(data.table)
#### set the seed value
set.seed(9901)
#### create the sample variables for creating the data
p01 <- sample(1:100,1000,replace = T)
p02 <- sample(1:100,1000,replace = T)
p03 <- sample(1:100,1000,replace = T)
p04 <- sample(1:100,1000,replace = T)
p05 <- sample(1:100,1000,replace = T)
p06 <- sample(1:100,1000,replace = T)
p07 <- sample(1:100,1000,replace = T)
#### create the data.table
data <- data.table(cbind(p01,p02,p03,p04,p05,p06,p07))
###user input for last column
lcol <- 6
###calculate start column as last - 3
scol <- lcol-3
###calculate average for scol:lcol
data <- data[,avg:= apply(.SD,1,mean,na.rm=T),.SDcols=scol:lcol]
###converting to data.frame since do not know the solution in data.table
data <- as.data.frame(data)
###calculate the trend in percentage
data$t01 <- data[,lcol-00]/data[,"avg"]-1
data$t02 <- data[,lcol-01]/data[,"avg"]-1
data$t03 <- data[,lcol-02]/data[,"avg"]-1
data$t04 <- data[,lcol-03]/data[,"avg"]-1
data$t05 <- data[,lcol-04]/data[,"avg"]-1
###converting back to data.table
data <- as.data.table(data)
###calculate the min and max for the trend
data1 <- data[, `:=` (trend_min = apply(.SD, 1, min, na.rm = TRUE),
                      trend_max = apply(.SD, 1, max, na.rm = TRUE)), .SDcols = c(scol:lcol)]
###calculate flag if any of t04 OR t05 is an outlier for min and max values. This would be many columns in actual data
data1$flag1 <- ifelse(data1$t04 < data1$trend_min | data1$t04 > data1$trend_max,1,0)
data1$flag2 <- ifelse(data1$t05 < data1$trend_min | data1$t05 > data1$trend_max,1,0)
data1$flag <- ifelse(data1$flag1 == 1 | data1$flag2 == 1,1,0)
So basically, how can I -
calculate the percentages based on the user input of the column index? Note it is not a simple division but a percentage change.
create the flag variable? I think I need to use the any function, but I am not sure how.
Some steps can be made more efficient, i.e. instead of using apply with MARGIN = 1, the mean, min and max can be replaced with rowMeans, pmin and pmax:
library(data.table)
data[, avg := rowMeans(.SD, na.rm = TRUE), .SDcols = scol:lcol]
data[, sprintf('t%02d', 1:5) := lapply(.SD, function(x) x/avg - 1),
     .SDcols = patterns("^p0[1-5]")]
data[, `:=` (trend_min = do.call(pmin, c(.SD, na.rm = TRUE)),
             trend_max = do.call(pmax, c(.SD, na.rm = TRUE))), .SDcols = c(scol:lcol)]
data
# p01 p02 p03 p04 p05 p06 p07 avg t01 t02 t03 t04 t05 trend_min trend_max
# 1: 35 53 22 82 100 59 69 65.75 -0.46768061 -0.19391635 -0.6653992 0.24714829 0.5209125 22 100
# 2: 78 75 15 65 70 69 66 54.75 0.42465753 0.36986301 -0.7260274 0.18721461 0.2785388 15 70
# 3: 15 45 27 61 63 75 99 56.50 -0.73451327 -0.20353982 -0.5221239 0.07964602 0.1150442 27 75
# 4: 41 80 13 22 63 84 17 45.50 -0.09890110 0.75824176 -0.7142857 -0.51648352 0.3846154 13 84
# 5: 53 9 75 47 25 75 66 55.50 -0.04504505 -0.83783784 0.3513514 -0.15315315 -0.5495495 25 75
# ---
# 996: 33 75 9 61 74 55 57 49.75 -0.33668342 0.50753769 -0.8190955 0.22613065 0.4874372 9 74
# 997: 24 68 74 11 43 75 37 50.75 -0.52709360 0.33990148 0.4581281 -0.78325123 -0.1527094 11 75
# 998: 62 78 82 97 56 50 74 71.25 -0.12982456 0.09473684 0.1508772 0.36140351 -0.2140351 50 97
# 999: 70 88 93 4 39 75 93 52.75 0.32701422 0.66824645 0.7630332 -0.92417062 -0.2606635 4 93
#1000: 20 50 99 94 62 66 98 80.25 -0.75077882 -0.37694704 0.2336449 0.17133956 -0.2274143 62 99
and then create the 'flag'
data[, flag := +(Reduce(`|`, lapply(.SD, function(x)
x < trend_min| x > trend_max))), .SDcols = t04:t05]
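As a sketch of an alternative with the same logic (assuming the t04/t05 columns exist), the flag can also be built with rowSums() on the logical comparisons:
# Flag rows where any selected trend column falls outside [trend_min, trend_max]
data[, flag := +(rowSums(sapply(.SD, function(x)
       x < trend_min | x > trend_max)) > 0), .SDcols = c("t04", "t05")]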

How to randomly select row from a dataframe for which the row skewness is larger that a given value in R

I am trying to select random rows from a data frame with 1000 rows (and six columns) where the skewness of the row is larger than a given value (say Sk > 0.3).
I've generated the following data frame
df=data.frame(replicate(6,sample(10:100,1000,rep=TRUE)))
I can get row skewness from the fBasics package:
rowSkewness(df) gives:
[8] -0.2243295435 0.5306809351 0.0707122386 0.0341447417 0.3339384838 -0.3910593364 -0.6443905090
[15] 0.5603809206 0.4406091534 -0.3736108832 0.0397860038 0.9970040772 -0.7702547535 0.2065830354
But now, I need to select say 10 rows of the df which have row skewness greater than say 0.1... Maybe with
for (a in 1:10) {
sample.data[a,] = sample(x=df[which(rowSkewness(df[sample(1:nrow(df),1)>0.1),], size = 1, replace = TRUE)
}
or something like this?
Any thoughts on this will be appreciated.
Thanks in advance.
You can use the sample_n() or sample_frac() function, which makes your version a little shorter:
library(dplyr)
library(fBasics)
df = data.frame(replicate(6, sample(10:100, 1000, rep = TRUE)))
x = df %>% dplyr::filter(rowSkewness(df) > 0.1) %>% dplyr::sample_n(10)
Got it:
x = df %>% filter(rowSkewness(df) > 0.1)
for (a in 1:samplesize) {
  sample.data[a, ] = x[sample(nrow(x), 1), ]
}
Just do a subset:
res1 <- DF[fBasics::rowSkewness(DF) > .1, ]
head(res1)
# X1 X2 X3 X4 X5 X6
# 7 56 28 21 93 74 24
# 8 33 56 23 44 10 12
# 12 29 19 29 38 94 95
# 13 35 51 54 98 66 10
# 14 12 51 24 23 36 68
# 15 50 37 81 22 55 97
Or with e1071::skewness:
res2 <- DF[apply(as.matrix(DF), 1, e1071::skewness) > .1, ]
stopifnot(all.equal(res1, res2))
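To then draw, say, 10 random rows from that subset, as the question asked (a sketch; it assumes res1 has at least 10 rows at this threshold):
set.seed(1)
sampled <- res1[sample(nrow(res1), 10), ]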
Data
set.seed(42); DF <- data.frame(replicate(6, sample(10:100, 1000, rep=TRUE)))

Normalise only some columns in R

I'm new to R and still getting to grips with how it handles data (my background is spreadsheets and databases). The problem I have is as follows. My data looks like this (it is held in a CSV file):
RecNo Var1 Var2 Var3
41 800 201.8 Y
43 140 39 N
47 60 20.24 N
49 687 77 Y
54 570 135 Y
58 1250 467 N
61 211 52 N
64 96 117.3 N
68 687 77 Y
Column 1 (RecNo) is my observation number; while it is a number, it is not required for my analysis. Column 4 (Var3) is a Yes/No column which, again, I do not currently need for the analysis but will need later in the process to add information in the output.
I need to normalise the numeric data in my dataframe to values between 0 and 1 without losing the other information. I have the following function:
normalize <- function(x) {
  x <- sweep(x, 2, apply(x, 2, min))
  sweep(x, 2, apply(x, 2, max), "/")
}
However, when I apply it to my above data by calling
myResult <- normalize(myData)
it returns an error because of the text in Column 4. If I set the text in this column to binary values it runs fine, but then also normalises my case numbers, which I don't want.
So, my question is: How can I change my normalize function above to accept the names of the columns to transform, while outputting the full dataset (i.e. without losing columns)?
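A minimal sketch of one way to do this, parameterizing the normalization by column names (normalize_cols and the column choice here are illustrative, not from the original post):
# Min-max normalize only the named columns; all other columns pass through
normalize_cols <- function(x, cols) {
  x[cols] <- lapply(x[cols], function(v) (v - min(v)) / (max(v) - min(v)))
  x
}
myResult <- normalize_cols(myData, c("Var1", "Var2"))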
I could not get TUSHAr's suggestion to work, but I have found two solutions that work fine:
1. akrun's suggestion above:
myData2 <- myData1 %>% mutate_at(2:3, funs((.-min(.))/max(.-min(.))))
This produces the following:
RecNo Var1 Var2 Var3
1 41 0.62184874 0.40601834 Y
2 43 0.06722689 0.04195255 N
3 47 0.00000000 0.00000000 N
4 49 0.52689076 0.12693105 Y
5 54 0.42857143 0.25663508 Y
6 58 1.00000000 1.00000000 N
7 61 0.12689076 0.07102414 N
8 64 0.03025210 0.21718329 N
9 68 0.52689076 0.12693105 Y
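(A side note, not from the original answers: funs() has since been deprecated in dplyr; a hypothetical equivalent using across() on current versions would be:)
library(dplyr)
myData2 <- myData1 %>%
  mutate(across(2:3, ~ (.x - min(.x)) / (max(.x) - min(.x))))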
Alternatively, there is the package BBmisc which allowed me the following after transforming my record numbers to factors:
> myData2 <- myData %>% mutate(RecNo = factor(RecNo))
> myNorm <- normalize(myData2, method = "range", range = c(0, 1), margin = 1)
> myNorm
RecNo Var1 Var2 Var3
1 41 0.62184874 0.40601834 Y
2 43 0.06722689 0.04195255 N
3 47 0.00000000 0.00000000 N
4 49 0.52689076 0.12693105 Y
5 54 0.42857143 0.25663508 Y
6 58 1.00000000 1.00000000 N
7 61 0.12689076 0.07102414 N
8 64 0.03025210 0.21718329 N
9 68 0.52689076 0.12693105 Y
EDIT: For completion I include TUSHAr's solution as well, showing as always that there are many ways around a single problem:
normalize <- function(x) {
  minval = apply(x[, c(2, 3)], 2, min)
  maxval = apply(x[, c(2, 3)], 2, max)
  #print(minval)
  #print(maxval)
  y = sweep(x[, c(2, 3)], 2, minval)
  #print(y)
  sweep(y, 2, (maxval - minval), "/")
}
df[, c(2, 3)] = normalize(df)
Thank you for your help!

Fisher's exact test on values from large dataframe and bypassing errors

I have a dataframe which is 214 columns wide and many rows long, and I want to perform a Fisher's exact test for each row using values from 4 columns.
An example subset of relevant information from my dataframe looks like:
Variant DB.count.1 DB.count.2 pop.count.1 pop.count.2
A 23 62 35 70
B 81 4 39 22
C 51 42 49 52
D NA NA 65 8
E 73 21 50 33
F 72 13 81 10
G 61 32 75 21
H NA NA 42 22
I NA NA 60 20
J 80 12 72 24
I am trying to use a for loop to:
create a contingency table for each row for the Fisher's exact test to compare DB.counts to pop.counts
run a Fisher's exact test using this contingency table to determine if there is a difference between DB.counts and pop.counts
output the p-value result to a new column on my dataframe
As you can see, there are NA values in some positions, and thus in some contingency tables. Obviously this will cause an error, which is OK, but I would like the code to output a value such as "." or "error" to the column when it encounters this error, and then skip to the next row/contingency table.
i.e. I would like an output which looks like this:
Variant DB.count.1 DB.count.2 pop.count.1 pop.count.2 fishers
A 23 62 35 70 0.4286
B 81 4 39 22 <0.0001
C 51 42 49 52 0.3921
D NA NA 65 8 error
E 73 21 50 33 0.0143
F 72 13 81 10 0.5032
G 61 32 75 21 0.0744
H NA NA 42 22 error
I NA NA 60 20 error
J 80 12 72 24 0.0425
The code I currently have (based on R loop over Fisher test - Error message) is:
df$fishers" <- for (i in 1:nrow(df))
{
table <- matrix(c(df[i,4], df[i,5], df[i,2], df[i,3]), ncol = 2, byrow = TRUE)
fisher.test(table, alternative="greater")
}
This seems to create the contingency tables the way I want, but the problem of bypassing the errors and writing the p-value to the new column remains. I have tried to use try and tryCatch but have been unsuccessful in doing so.
I am an R beginner so really appreciate any advice on how to improve my questions or any advice for my problem! Thank you!
Edit 1: I have now tried using the data.table package as below, and I get what I need from data sets with no NA values, but how do I skip the errors and make the code continue? Thanks!!!
library(data.table)
dt <- data.table(df)
dt[, p.val := fisher.test(matrix(c(pop.count.1, pop.count.2, DB.count.1, DB.count.2), ncol=2), workspace=1e9)$p.value, by=Variant]
df <- as.data.frame(dt)
You can include an if-else statement in your loop like this:
res <- NULL
for (i in 1:nrow(df)) {
  table <- matrix(c(df[i,4], df[i,5], df[i,2], df[i,3]), ncol = 2, byrow = TRUE)
  # if any NA occurs in the table, save an error in p; else run the fisher test
  if (any(is.na(table))) p <- "error" else p <- fisher.test(table, alternative = "greater")$p.value
  # save all p values in a vector
  res <- c(res, p)
}
df$fishers <- res
Or put the code in a function and use apply instead of a loop:
foo <- function(y) {
  # include as.numeric here to be sure that your values are numeric:
  table <- matrix(as.numeric(c(y[4], y[5], y[2], y[3])), ncol = 2, byrow = TRUE)
  if (any(is.na(table))) p <- "error" else p <- fisher.test(table, alternative = "greater")$p.value
  p
}
df$fishers <- apply(df, 1, foo)
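Since the question mentioned trying tryCatch without success, here is a sketch of that route for completeness: any error raised by fisher.test() (not just those caused by NA values) is caught and recorded as "error":
df$fishers <- apply(df, 1, function(y) {
  tab <- matrix(as.numeric(c(y[4], y[5], y[2], y[3])), ncol = 2, byrow = TRUE)
  # return the p-value, or the string "error" if fisher.test() fails
  tryCatch(fisher.test(tab, alternative = "greater")$p.value,
           error = function(e) "error")
})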

Apply over all columns and rows of two diffrent dataframes in R

I am trying to apply a function over all rows and columns of two dataframes, but I don't know how to solve it with apply.
I think the following script explains what I intend to do and the way I tried to solve it. Any advice would be warmly appreciated! Please note that simplefunction is only intended as an example function, to keep things simple.
# some data and a function
df1<-data.frame(name=c("aa","bb","cc","dd","ee"),a=sample(1:50,5),b=sample(1:50,5),c=sample(1:50,5))
df2<-data.frame(name=c("aa","bb","cc","dd","ee"),a=sample(1:50,5),b=sample(1:50,5),c=sample(1:50,5))
simplefunction<-function(a,b){a+b}
# apply on a single row
simplefunction(df1[1,2],df2[1,2])
# apply over all colums
apply(?)
## apply over all columns and rows
# create df to receive results
df3<-df2
# loop it
for (i in 2:5)df3[i]<-apply(?)
My first mapply answer!! For your simple example you have...
mapply( FUN = `+` , df1[,-1] , df2[,-1] )
# a b c
# [1,] 60 35 75
# [2,] 57 39 92
# [3,] 72 71 48
# [4,] 31 19 85
# [5,] 47 66 58
You can extend it like so...
mapply( FUN = function(x,y,z,etc){ simplefunctioncodehere} , df1[,-1] , df2[,-1] , ... other dataframes here )
The dataframes will be passed in order to the function, so in this example df1 would be x, df2 would be y, and z and etc would be other dataframes that you specify, in that order. Hopefully that makes sense. Since a data.frame is a list of its columns, mapply will take the first column of every dataframe and apply the function to them (the + here being vectorized over the rows), then the second column of every dataframe, and so on.
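For instance, with a third (hypothetical) data frame df3 built the same way as df1 and df2, the extension might look like:
df3 <- data.frame(name = c("aa","bb","cc","dd","ee"),
                  a = sample(1:50, 5), b = sample(1:50, 5), c = sample(1:50, 5))
# Element-wise sum across the three dataframes, column by column
mapply(FUN = function(x, y, z) x + y + z, df1[,-1], df2[,-1], df3[,-1])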
You can also use Reduce:
set.seed(45) # for reproducibility
Reduce(function(x,y) { x + y}, list(df1[, -1], df2[,-1]))
# a b c
# 1 53 22 23
# 2 64 28 91
# 3 19 56 51
# 4 38 41 53
# 5 28 42 30
You can just do :
df1[,-1] + df2[,-1]
Which gives :
a b c
1 52 24 37
2 65 63 62
3 31 90 89
4 90 35 33
5 51 33 45
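If you also want to keep the name column alongside the result, a small sketch:
res <- cbind(df1[1], df1[,-1] + df2[,-1])
head(res)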
