Only include outliers from each column in a dataframe

Only include outliers from each column in a dataframe - r

I have a dataframe as follows:
chr leftPos TBGGT 12_try 324Gtt AMN2
1 24352 34 43 19 43
1 53534 2 1 -1 -9
2 34 -15 7 -9 -18
3 3443 -100 -4 4 -9
3 3445 -100 -1 6 -1
3 3667 5 -5 9 5
3 7882 -8 -9 1 3
I have to create a loop which:
a) Calculates the upper and lower limit (UL and LL) for each column from the third column onwards.
b) Only includes rows that fall outside of the UL and LL (Zoutliers).
c) Then count the number of rows where the Zoutlier is the same direction (i.e. positive or negative) as the previous or the subsequent row for the same chr.
The output would therefore be:
ZScore1 TBGGT 12_try 324Gtt AMN2
nrow 4 6 4 4
So far I have code as follows:
library(data.table)#v1.9.5
f1 <- function(df, ZCol){
#A) Determine the UL and LL and then generate the Zoutliers
UL = median(ZCol, na.rm = TRUE) + alpha*IQR(ZCol, na.rm = TRUE)
LL = median(ZCol, na.rm = TRUE) - alpha*IQR(ZCol, na.rm = TRUE)
Zoutliers <- which(ZCol > UL | ZCol < LL)
#B) Exclude Zoutliers per chr if same direction as previous or subsequent row
na.omit(as.data.table(df)[, {tmp = sign(eval(as.name(ZCol)))
.SD[tmp==shift(tmp) | tmp==shift(tmp, type='lead')]},
by=chr])[, list(.N)]}
nm1 <- paste0(names(df)
setnames(do.call(cbind,lapply(nm1, function(x) f1(df, x))), nm1)[]
The code is patched together from various places. The problem I have is combining parts A) and B) of the code to get the output I want

Can you try this function? I was not sure what alpha is, so I could not reproduce the expected output and included it as variable in the function.
# read your data per copy&paste
d <- read.table("clipboard",header = T)
# or as in Frank comment mentioned solution via fread
d <- data.table::fread("chr leftPos TBGGT 12_try 324Gtt AMN2
1 24352 34 43 19 43
1 53534 2 1 -1 -9
2 34 -15 7 -9 -18
3 3443 -100 -4 4 -9
3 3445 -100 -1 6 -1
3 3667 5 -5 9 5
3 7882 -8 -9 1 3")
# set up the function
foo <- function(x, alpha, chr){
# your code for task a) and b)
UL = median(x, na.rm = TRUE) + alpha*IQR(x, na.rm = TRUE)
LL = median(x, na.rm = TRUE) - alpha*IQR(x, na.rm = TRUE)
Zoutliers <- which(x > UL | x < LL)
# part (c
# factor which specifies the direction. 0 values are set as positives
pos_neg <- ifelse(x[Zoutliers] >= 0, "positive", "negative")
# count the occurrence per chromosome and direction.
aggregate(x[Zoutliers], list(chr[Zoutliers], pos_neg), length)
}
# apply over the columns and get a list of dataframes with number of outliers per chr and direction.
apply(d[,3:ncol(d)], 2, foo, 0.95, d$chr)

Related

R: How to fill out values in a DF which are dependent on previous rows

I have a dataframe, and I want to do some calculations depending on the previous rows (like dragging informations down in excel). My DF looks like this:
set.seed(1234)
df <- data.frame(DA = sample(1:3, 6, rep = TRUE) ,HB = sample(0:600, 6, rep = TRUE), D = sample(1:5, 6, rep = TRUE), AD = sample(1:14, 6, rep = TRUE), GM = sample(30:31, 6, rep = TRUE), GL = NA, R =NA, RM =0 )
df$GL[1] = 646
df$R[1] = 60
df$DA[5] = 2
df
# DA HB D AD GM GL R RM
# 1 2 399 4 13 30 646 60 0
# 2 2 97 4 10 31 NA NA 0
# 3 1 102 5 5 31 NA NA 0
# 4 3 325 4 2 31 NA NA 0
# 5 2 78 3 14 30 NA NA 0
# 6 1 269 4 8 30 NA NA 0
I want to fill out the missing values in my GL, R and RM columns, and the values are dependent on each other. So eg.
attach(df)
#calc GL and R for the 2nd row
df$GL[2] <- GL[1]+HB[2]+RM[1]
df$R[2] <- df$GL[2]*D[2]/GM[2]*AD[2]
#calc GL and R for the 3rd row
df$GL[3] <- df$GL[2]+HB[3]+df$RM[2]
df$R[3] <-df$GL[3]*D[3]/GM[3]*AD[3]
#and so on..
Is there a way to do all the calculations at once, instead of row by row?
In addition, each time the column 'DA' = 1, the previous values for 'R' should be summed up for the same row for 'RM', but only from the last occurence. So that
attach(df)
df$RM[3] <-R[1]+R[2]+R[3]
#and RM for the 6th row is calculated by
#df$RM[6] <-R[4]+R[5]+R[6]
Thanks a lot in advance!

You can use a for loop to calculate GL values and once you have them you can do the calculation for R columns directly.
for(i in 2:nrow(df)) {
df$GL[i] <- with(df, GL[i-1]+HB[i]+RM[i-1])
}
df$R <- with(df, (GL* D)/(GM *AD))

You can use indexing to solve the first two problems:
> # Original code from question~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> set.seed(1234)
> df <- data.frame(DA = sample(1:3, 6, rep = TRUE), HB = sample(0:600, 6, rep = TRUE),
+ D = sample(1:5, 6, rep = TRUE), AD = sample(1:14, 6, rep = TRUE),
+ GM = sample(30:31, 6, rep = TRUE), GL = NA, R =NA, RM =0 )
> df$GL[1] = 646
> df$R[1] = 60
> df$DA[5] = 2
> #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> # View df
> df
DA HB D AD GM GL R RM
1 2 399 4 13 30 646 60 0
2 2 97 4 10 31 NA NA 0
3 1 102 5 5 31 NA NA 0
4 3 325 4 2 31 NA NA 0
5 2 78 3 14 30 NA NA 0
6 1 269 4 8 30 NA NA 0
> # Solution below, based on indexing
> # 1. GL column
> df$GL <- cumsum(c(df$GL[1], df$HB[-1] + df$RM[-nrow(df)]))
> # 2. R column
> df$R[-1] <- (df$GL * df$D / df$GM * df$AD)[-1]
> # May be more clear like this (same result)
> df$R[-1] <- df$GL[-1] * df$D[-1] / df$GM[-1] * df$AD[-1]
> # Or did you mean this for last *?
> df$R[-1] <- (df$GL * df$D / (df$GM * df$AD))[-1]
The third problem can be solved with a loop.
> df$RM[1] <- df$R[1]
> for (i in 2:nrow(df)) {
+ df$RM[i] <- df$R[i] + df$RM[i-1] * (df$DA[i] != 2)
+ }
> df
DA HB D AD GM GL R RM
1 2 399 4 13 30 646 60.000000 60.000000
2 2 97 4 10 31 743 9.587097 9.587097
3 1 102 5 5 31 845 27.258065 36.845161
4 3 325 4 2 31 1170 75.483871 112.329032
5 2 78 3 14 30 1248 8.914286 8.914286
6 1 269 4 8 30 1517 25.283333 34.197619
Do these results look correct?
Update: Assuming RM should = R unless DA = 1, and in that case RM = sum of current row and previous R up to (not including) the above row with DA = 1, try the following loop.
df$RM[1] <- cs <- df$R[1]
for (i in 2:nrow(df)) {
df$RM[i] <- df$R[i] + cs * (df$DA[i] == 1)
cs <- cs * (df$DA[i] != 1) + df$R[i]
}

t-test through all combinations of all factors all levels

I have a dataframe with the following structure:
> str(data_l)
'data.frame': 800 obs. of 5 variables:
$ Participant: int 1 2 3 4 5 6 7 8 9 10 ...
$ Temperature: Factor w/ 4 levels "35","37","39",..: 3 3 3 3 3 3 3 3 3 3 ...
$ Region : Factor w/ 5 levels "Eyes","Front",..: 3 3 3 3 3 3 3 3 3 3 ...
$ Time : Factor w/ 5 levels "0","15","30",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Rating : num 5 5 5 4 5 5 5 5 5 5 ...
I want to run one-sample t-test for each combination of all factors all levels, for a total of 4*5*5 = 100 t-tests, with Rating as dependent variables, or y.
I am stuck at looping through the combinations, and performing t-test at each combo.
I tried splitting the dataframe by the factors, then lapply t.test() through the list, but to no avail.
Does anyone have a better approach? Cheers!
Edit
My ultimate intention is to calculate confidence interval for arrays in all factors all levels. For instance, I was able to do this:
subset1 <- data_l$Rating[data_l$Temperature == 35 & data_l$Region == "Front" & data_l$Time == 0]
Then,
t.test(subset1)$conf.int
But the problem is I will have to do this 100 times.
Edit 2
I am recreating the dataframe.
Temperature <- rep(seq(35, 41, 2), 10)
Region <- rep(c("Front", "Back", "Eyes", "Left", "Right"), 8)
Time <- rep(seq(0, 60, 15), 8)
Rating <- sample(1:5, 40, replace = TRUE)
data_l <- data.frame(Region = factor(Region), Temperature = factor(Temperature), Time = factor(Time), Rating = as.numeric(Rating))

Two things.
Can this be done? Certainly. Should it? Many of your combinations may have insufficient data to find a reasonable confidence interval. While your data sample is certainly reduced and simplified, I don't have assurances that there will be sufficient fillingness of your factor combinations.
table(sapply(split(data_l$Rating, data_l[,c("Temperature","Region","Time")]), length))
# 0 2
# 80 20
(There are 80 "empty" combinations of your factor levels.)
Let's try this:
outs <- aggregate(data_l$Rating, data_l[,c("Temperature","Region","Time")],
function(x) if (length(unique(x)) > 1) t.test(x)$conf.int else c(NA, NA))
nrow(outs)
# [1] 20
head(outs)
# Temperature Region Time x.1 x.2
# 1 35 Front 0 NA NA
# 2 37 Front 0 -9.706205 15.706205
# 3 39 Front 0 -2.853102 9.853102
# 4 41 Front 0 -15.559307 22.559307
# 5 35 Back 15 -15.559307 22.559307
# 6 37 Back 15 -4.853102 7.853102
Realize that this is not five columns; the fourth is really a matrix embedded in a frame column:
head(outs$x)
# [,1] [,2]
# [1,] NA NA
# [2,] -9.706205 15.706205
# [3,] -2.853102 9.853102
# [4,] -15.559307 22.559307
# [5,] -15.559307 22.559307
# [6,] -4.853102 7.853102
It's easy enough to extract:
outs$conf1 <- outs$x[,1]
outs$conf2 <- outs$x[,2]
outs$x <- NULL
head(outs)
# Temperature Region Time conf1 conf2
# 1 35 Front 0 NA NA
# 2 37 Front 0 -9.706205 15.706205
# 3 39 Front 0 -2.853102 9.853102
# 4 41 Front 0 -15.559307 22.559307
# 5 35 Back 15 -15.559307 22.559307
# 6 37 Back 15 -4.853102 7.853102
(If you're wondering why I have a conditional on length(unique(x)) > 1, then see what happens without it:
aggregate(data_l$Rating, data_l[,c("Temperature","Region","Time")],
function(x) t.test(x)$conf.int)
# Error in t.test.default(x) : data are essentially constant
This is because there are combinations with empty data. You'll likely see something similar with not-empty but still invariant data.)

I am stuck at looping through the combinations, and performing t-test
at each combo.
I'm not sure if this is what you wanted.
N <- 800
df <- data.frame(Participant=1:N,
Temperature=gl(4,200),
Region=sample(1:5, 800, TRUE),
Time=sample(1:5, 800, TRUE),
Rating=sample(1:5, 800, TRUE))
head(df)
t_test <- function(data, y, x){
x <- eval(substitute(x), data)
y <- eval(substitute(y), data)
comb <- combn(levels(x), m=2) # this gives all pair-wise combinations
n <- dim(comb)[2]
t <- vector(n, mode="list")
for(i in 1:n){
xlevs <- comb[,i]
DATA <- subset(data, subset=x %in% xlevs)
x2 <- factor(x, levels=xlevs)
tt <- t.test(y~x2, data=DATA)
t[[i]] <- tt
names(t)[i] <- toString(xlevs)
}
t
}
T.test <- t_test(df, Rating, Temperature)
T.test[1]
$`1, 2`
Welch Two Sample t-test
data: y by x2
t = -1.0271, df = 396.87, p-value = 0.305
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.4079762 0.1279762
sample estimates:
mean in group 1 mean in group 2
2.85 2.99

How to apply a formula one row at a time in R - row 2's values from calculated values of row 1

I have a data frame where I need to apply a formula to create new columns. The catch is, I need to calculate these numbers one row at a time. For eg,
df <- data.frame(c(1:10),c(21:30),5,10)
names(df) <- c('a','b','c','d')
I now need to create columns 'c' and 'd' as follows. Column 'c' whose R1 value is fixed as 5. But from R2 onwards the value of 'c' is calculated as (c (from previous row) - b(from previous row). Column 'd' R1 value is fixed as 10, but from R2 onwards, 'd' is calculated as 'c' from R2 - d from previous row.
I want my output to look like this:
A B C D
1 21 5 10
2 22 -16 -26
3 23 -38 -12
And so on. My actual data has over 1000 rows and 18 columns. For every row, 5 of the column values come from different columns of the previous row only (no other rows). And the rest of the column values are calculated from these newly calculated row values. I am quite at a loss in creating a formula that will apply my formulae to each row, calculate values for that row and then move to the next row. I know that I have simplified the problem a bit here, but this captures the essence of what I am attempting.
This is what I attempted:
df <- within(df, {
v1 <- shift(c)
v2 <- shift(d)
c <- v1-shift(b)
d <- c-v2
})
However, I need to apply this only from row 2 onwards and I have no idea how to do that.Because of that, I get something like this:
a b c d v2 v1
1 21 NA NA NA NA
2 22 4 -6 10 5
3 23 4 -6 10 5
I only get these values repeatedly for c, and d (4, -6, 10, 5).
Output
Thank you for your help.

df <- data.frame(a = 1:10, b = 21:30, c = 5:-4, d = 10)
for (i in (2:nrow(df))) {
df[i, "c"] <- df[i - 1, "c"] - df[i - 1, "b"]
df[i, "d"] <- df[i, "c"] - df[i - 1, "d"]
}
df[1:3, ]
a b c d
1 1 21 5 10
2 2 22 -16 -26
3 3 23 -38 -12
Edit: adapting to your comment
# Let's define the coefficients of the equations into a dataframe
equation1 <- c("c", 0, 0, 0, 0, 0, -1, 1, 0) # c (from previous row) - b(from previous row)
equation2 <- c("d", 0, 0, 1, 0, 0, 0, 0, -1) # d is calculated as 'c' from R2 - d from previous row
equations <- data.frame(rbind(equation1,equation2), stringsAsFactors = F)
names(equations) <- c("y","a","b","c","d","a_previous","b_previous","c_previous","d_previous")
equations
# y a b c d a_previous b_previous c_previous d_previous
# "c" 0 0 0 0 0 -1 1 0
# "d" 0 0 1 0 0 0 0 -1
# define function to mutiply the rows of the dataframes
sumProd <- function(vect1, vect2) {
return(as.numeric(as.numeric(vect1) %*% as.numeric(vect2)))
}
# Apply the formulas to the originaldataframe
for (i in (2:nrow(df))) {
for(e in 1:nrow(equations)) {
df[i, equations[e, 'y']] <- sumProd(equations[e, c('a','b','c','d')], df[i, c('a','b','c','d')]) +
sumProd(equations[e, paste0(c('a','b','c','d'),'_previous')], df[i - 1, c('a','b','c','d')])
}
}
df[1:3,]
a b c d
1 1 21 5 10
2 2 22 -16 -26
3 3 23 -38 -12

It might not be the most elegant way to do it with a for loop but it works. Your column c sounds like a simple sequence to me.
This is waht I would do:
df <- data.frame(c(1:10),c(21:30),5,10)
names(df) <- c('a','b','c','d')
# Use a simple sequence for c
df$c <- seq(5,5-(dim(df)[1]-1))
# Use for loop to calculate d
for(i in 2:(length(df$d)-1))
{
df$d[i] <- df$c[i] - df$d[i-1]
}
> df
a b c d
1 1 21 5 10
2 2 22 4 -6
3 3 23 3 9
4 4 24 2 -7
5 5 25 1 8
6 6 26 0 -8
7 7 27 -1 7
8 8 28 -2 -9
9 9 29 -3 6
10 10 30 -4 10

Create a dataframe from outliers in a function

OK so admittedly this is related to another question here but there has been no response and I suspect it is because I have made it too complex. So Im asking this question which is different because it is simplified. Happy to be scolded if this is not acceptable.
My core problem is that I want to create a dataframe by including outliers only from each column. The dataframe looks like:
chr leftPos TBGGT 12_try 324Gtt AMN2
1 24352 34 43 19 43
1 53534 2 1 -1 -9
2 34 -15 7 -9 -18
3 3443 -100 -4 4 -9
3 3445 -100 -1 6 -1
3 3667 5 -5 9 5
3 7882 -8 -9 1 3
I would like to calculate the upper and lower limit of each column (from the third onwards), exclude all rows that fall within the limits so I only keep outliers, and then end up with a dataframe as follows (for each column). This dataframe then gets passed to the next bit of the code (in the loop) but I wont elaborate on this for the sake of simplicity
chr leftPos TBGGT
2 34 -15
3 3443 -100
3 3445 -100
My code so far:
alpha= 1.5
f1 <- function(df, ZCol){
# Determine the UL and LL and then generate the Zoutliers
UL = median(ZCol, na.rm = TRUE) + alpha*IQR(ZCol, na.rm = TRUE)
LL = median(ZCol, na.rm = TRUE) - alpha*IQR(ZCol, na.rm = TRUE)
Zoutliers <- which(ZCol > UL | ZCol < LL)}
but this just gives me the outlier values without the chr and leftPos it is associated with. How do I get this?

Maybe this:
DF <- read.table(text=" chr leftPos TBGGT 12_try 324Gtt AMN2
1 24352 34 43 19 43
1 53534 2 1 -1 -9
2 34 -15 7 -9 -18
3 3443 -100 -4 4 -9
3 3445 -100 -1 6 -1
3 3667 5 -5 9 5
3 7882 -8 -9 1 3", header = TRUE)
#fix your function as explained by #Thilo
#also make alpha a parameter with default value
f1 <- function(ZCol, alpha = 1.5){
UL <- median(ZCol, na.rm = TRUE) + alpha*IQR(ZCol, na.rm = TRUE)
LL <- median(ZCol, na.rm = TRUE) - alpha*IQR(ZCol, na.rm = TRUE)
ZCol > UL | ZCol < LL
}
#loop over the columns and subset with the function's logical return values
outlist <- lapply(3:6, function(i, df) {
df[f1(df[,i]), c(1:2, i)]
}, df = DF)
#[[1]]
# chr leftPos TBGGT
#4 3 3443 -100
#5 3 3445 -100
#
#[[2]]
# chr leftPos X12_try
#1 1 24352 43
#
#[[3]]
# chr leftPos X324Gtt
#1 1 24352 19
#3 2 34 -9
#
#[[4]]
# chr leftPos AMN2
#1 1 24352 43

You did basically provide the answer yourself, your just missing the last final link.
Your function computes the limits you define for outliers. We change the result such that it returns a boolean vector that is true if the value is an outlier:
isOutlier <- function(values) {
# Determine the UL and LL
UL <- median(values, na.rm = TRUE) + alpha*IQR(values, na.rm = TRUE)
LL <- median(values, na.rm = TRUE) - alpha*IQR(values, na.rm = TRUE)
values > UL | values < LL # Return a boolean vector that can be used as a filter later on.
}
Now you can subset your data frame simply using this function, i.e.
AMN2.outliers <- subset(df, isOutlier(AMN2))
or
AMN2.outliers <- df[isOutlier(AMN2),]
whichever suites you more. Of course you could also wrap this line in the function, but for readability I prefere the solution above.
Besides: I would suggest using the <- operator for assignment instead of =. See here.

Easiest way to subset within a multidimensional range?

I apologize in advance if this has been asked before, or if I have missed something obvious.
I have two data sets, 'olddata' and 'newdata'
set.seed(0)
olddata <- data.frame(x = rnorm(10, 0,5), y = runif(10, 0, 5), z = runif(10,-10,10))
newdata <- data.frame(x = -5:5, z = -5:5)
I create a model from the old data, and want to predict values from the new data
mymodel <- lm(y ~ x+z, data = olddata)
predict.lm(mymodel, newdata)
However, I'd like to restrict the range of variables in 'newdata' to the range of variables on which the model was trained.
of course I could do this:
newnewdata <- subset(newdata,
x < max(olddata$x) & x > min(olddata$x) &
z < max(olddata$z) & z > max(olddata$z))
But this gets intractable over many dimensions. Is there a less repetitive way to do this?

It seems that all the values in your newdata are already within the appropriate ranges, so there's nothing there to subset. If we expand the ranges of newdata:
set.seed(0)
olddata <- data.frame(x = rnorm(10, 0,5), y = runif(10, 0, 5), z = runif(10,-10,10))
newdata <- data.frame(x = -10:10, z = -10:10)
newdata
x z
1 -10 -10
2 -9 -9
3 -8 -8
4 -7 -7
5 -6 -6
6 -5 -5
7 -4 -4
8 -3 -3
9 -2 -2
10 -1 -1
11 0 0
12 1 1
13 2 2
14 3 3
15 4 4
16 5 5
17 6 6
18 7 7
19 8 8
20 9 9
21 10 10
Then all we need to do is identify the ranges for each variable of olddata and then loop through as many iterations of subset as newdata has columns:
ranges <- sapply(olddata, range, na.rm = TRUE)
for(i in 1:ncol(newdata)) {
col_name <- colnames(newdata)[i]
newdata <- subset(newdata,
newdata[,col_name] >= ranges[1, col_name] &
newdata[,col_name] <= ranges[2, col_name])
}
newdata
x z
4 -7 -7
5 -6 -6
6 -5 -5
7 -4 -4
8 -3 -3
9 -2 -2
10 -1 -1
11 0 0
12 1 1
13 2 2
14 3 3
15 4 4
16 5 5
17 6 6

Here is an approach using the *apply family (using SchaunW's newdata):
set.seed(0)
olddata <- data.frame(x = rnorm(10, 0, 5), y = runif(10, 0, 5), z = runif(10,-10,10))
newdata <- data.frame(x = -10:10, z = -10:10)
minmax <- sapply(olddata[-2], range)
newdata[apply(newdata, 1, function(a) all(a > minmax[1,] & a < minmax[2,])), ]
Some care is required because I have assumed the columns of olddata (after dropping the second column) are identical to newdata.
Brevity comes at the cost of speed. After increasing nrow(newdata) to 2000 to emphasis the difference I found:
test replications elapsed relative user.self sys.self user.child sys.child
1 orizon() 100 2.193 27.759 2.191 0.002 0 0
2 SchaunW() 100 0.079 1.000 0.075 0.004 0 0
My guess at the main cause is that repeated subsetting avoids testing whether rows meet the criteria examined after they are excluded.

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Only include outliers from each column in a dataframe - r

Related

R: How to fill out values in a DF which are dependent on previous rows

t-test through all combinations of all factors all levels

How to apply a formula one row at a time in R - row 2's values from calculated values of row 1

Create a dataframe from outliers in a function

Easiest way to subset within a multidimensional range?

Categories

Resources