How to calculate a value based on several variables? - r

I have a dataset (df) like this:
Iso conc. rep time OD
1 1 1 0 0.2
1 1.5 2 0 0.2
1 2 3 0 0.2
2 1 1 0 0.3
2 1.5 2 0 0.25
2 2 3 0 0.3
1 1 1 1 0.4
1 1.5 2 1 0.35
1 2 3 1 0.38
2 1 1 1 0.4
2 1.5 2 1 0.45
2 2 3 1 0.43
And I want to compute growth = OD(time=1) - OD(time=0) for each combination of Iso, conc., and rep.
The output would be like this:
Iso conc. rep time growth
1 1 1 1 0.2
1 1.5 2 1 0.15
1 2 3 1 0.18
2 1 1 1 0.1
2 1.5 2 1 0.2
2 2 3 1 0.13
I have been thinking to use data.table to calculate the growth.
DT <- as.data.table(df)
DT[, , by = .(Iso,conc.,rep,set)]
But I don't know what to write between the two commas. Could somebody help me?

Using data.table you can simply do:
# For each (Iso, conc., rep) group: OD at time 1 minus OD at time 0.
DT[, .(growth = OD[time == 1] - OD[time == 0]), by = .(Iso, conc., rep)]
# Iso conc. rep growth
#1: 1 1.0 1 0.20
#2: 1 1.5 2 0.15
#3: 1 2.0 3 0.18
#4: 2 1.0 1 0.10
#5: 2 1.5 2 0.20
#6: 2 2.0 3 0.13

You can do this with:
# One row per (Iso, conc., rep): OD at time 1 minus OD at time 0.
DT[, .(growth = OD[time == 1] - OD[time == 0]), by = list(Iso, conc., rep)]
Or alternatively, if you are sure there are only two values in each group:
# Rows are sorted by time first, so within each group diff(OD) is
# OD(later time) - OD(earlier time); valid only with two rows per group.
DT[order(time), list(growth = diff(OD)), by = .(Iso, conc., rep)]

Related

Quartile sorter with externally specified quartile breakpoints in R data.table

I want to sort observations into quartiles on the variable "varbl". Since my data is pretty big (2Gb), I am trying to implement it via data.table. The problem is that I need to use external quartile breaks, which are group-specific. The group variable is "prd" or "prd1".
My data and breakpoints are as follows:
data <- data.table(id = c(1,2,3,4,5,1,2,3,4,5), prd1 = c(1,1,1,1,1,2,2,2,2,2), varbl = c(-1.6, -0.7, 0.1, 1.2, -0.5, -0.8, 0.4, 1.2, 1.9, 4))
bks <- data.table(prd=c(1,2), br0 = c(-5,-5), br1=c(-1,0), br2=c(0, 0.5), br3=c(1, 3), br4=c(5,5))
> data
id prd1 varbl
1: 1 1 -1.6
2: 2 1 -0.7
3: 3 1 0.1
4: 4 1 1.2
5: 5 1 -0.5
6: 1 2 -0.8
7: 2 2 0.4
8: 3 2 1.2
9: 4 2 1.9
10: 5 2 4.0
> bks
prd br0 br1 br2 br3 br4
1: 1 -5 -1 0.0 1 5
2: 2 -5 0 0.5 3 5
The desired output is:
> output
id prd1 varbl ntile
1: 1 1 -1.6 1
2: 2 1 -0.7 2
3: 3 1 0.1 3
4: 4 1 1.2 4
5: 5 1 -0.5 2
6: 1 2 -0.8 1
7: 2 2 0.4 2
8: 3 2 1.2 3
9: 4 2 1.9 3
10: 5 2 4.0 4
I tried the following code, but it fails, since I can not subset bks on the same prd as the current prd1 from data:
data[, ntile := cut(varbl, breaks = bks[prd==prd1], include.lowest=TRUE, labels = 1:4)]
As another attempt, I tried to join data and bks first (I would prefer not to as it will increase the size of data from 2Gb to 4Gb)
and then sort observations into quantiles. It fails, since I can not understand how to use column names to construct a vector of breakpoints for every row. None of the attempts worked.
setnames(data, "prd1", "prd")
data <- data[bks, on="prd", nomatch=0]
data[, ntile := cut(varbl, breaks = .(br0, br1, br2, br3, br4), include.lowest=TRUE, labels=1:4)]
data[, ntile := cut(varbl, breaks = colnames(bks)[-1], include.lowest=TRUE, labels=1:4)]
data[, ntile := cut(varbl, breaks = c("br0", "br1", "br2", "br3", "br4"), include.lowest=TRUE, labels=1:4)]
Rearranging bks a little means you can do this as a join:
# For each prd, turn the breakpoint columns into consecutive (lower, upper) pairs:
# embed(x, 2) yields rows (x[i+1], x[i]); [, 2:1] swaps them to (x[i], x[i+1]).
bks <- bks[, data.frame(embed(unlist(.SD),2)[,2:1]), by=prd]
# Number the intervals within each prd; grp is the interval (quartile) index.
bks[, grp := seq_len(.N), by=prd]
# prd X1 X2 grp
#1: 1 -5.0 -1.0 1
#2: 1 -1.0 0.0 2
#3: 1 0.0 1.0 3
#4: 1 1.0 5.0 4
#5: 2 -5.0 0.0 1
#6: 2 0.0 0.5 2
#7: 2 0.5 3.0 3
#8: 2 3.0 5.0 4
# Non-equi update join: each data row whose prd1 equals prd and whose varbl
# satisfies X1 <= varbl < X2 receives that interval's grp (assigned via :=).
data[bks, on=c("prd1"="prd","varbl>=X1","varbl<X2"), grp := i.grp]
# id prd1 varbl grp
# 1: 1 1 -1.6 1
# 2: 2 1 -0.7 2
# 3: 3 1 0.1 3
# 4: 4 1 1.2 4
# 5: 5 1 -0.5 2
# 6: 1 2 -0.8 1
# 7: 2 2 0.4 2
# 8: 3 2 1.2 3
# 9: 4 2 1.9 3
#10: 5 2 4.0 4

creating time to improvement of +1 variable in r?

I want to create a "time to improvement of +1" variable. I have longitudinal data in long format, measured at baseline and at 3, 6 and 9 months. How do I go about it in R? The improvement is measured from the baseline value.
The data is like this:
sno time WHZ
1 0 -0.5
1 3 1.4
1 6 -0.7
1 9 2.2
2 0 -0.63
2 3 0.7
2 6 -2.64
2 9 2.1
expected output
sno time WHZ impr First time to imp
1 0 -0.5 0 3
1 3 1.4 1.9 3
1 6 -0.7 -0.2 3
1 9 2.2 2.7 3
2 0 -0.63 0 3
2 3 0.7 1.33 3
2 6 -2.64 -2.01 3
2 9 2.1 2.73 3
Codes I was trying to use to first create the improvement variable:
library(dplyr)
data %>%
group_by(sno)%>%
mutate(ImprvWHZ = data$WHZ - lag(data$WHZ, default = data$WHZ[1]))
If I understand the question correctly, here is a dplyr solution.
library(dplyr)
# Within each sno group, measure the change from the group's first row and
# report the elapsed time on rows whose change exceeds 1 (NA elsewhere).
dat %>%
  group_by(sno) %>%
  mutate(
    Improv = WHZ - first(WHZ),
    TimeToImprov = ifelse(Improv > 1, time - first(time), NA)
  )
## A tibble: 8 x 5
## Groups: sno [2]
# sno time WHZ Improv TimeToImprov
# <int> <int> <dbl> <dbl> <int>
#1 1 0 -0.5 0 NA
#2 1 3 1.4 1.9 3
#3 1 6 -0.7 -0.200 NA
#4 1 9 2.2 2.7 9
#5 2 0 -0.63 0 NA
#6 2 3 0.7 1.33 3
#7 2 6 -2.64 -2.01 NA
#8 2 9 2.1 2.73 9
And here is a base R solution.
# Split by subject, add the change from each subject's first row and the
# elapsed time where that change exceeds 1, then stack the pieces back together.
res <- do.call(rbind, lapply(split(dat, dat$sno), function(grp) {
  gain <- grp$WHZ - grp$WHZ[1]
  elapsed <- grp$time - grp$time[1]
  grp$Improv <- gain
  grp$TimeToImprov <- ifelse(gain > 1, elapsed, NA)
  grp
}))
# Drop the "<sno>.<row>" row names that rbind builds from the list names.
rownames(res) <- NULL
res
# sno time WHZ Improv TimeToImprov
#1 1 0 -0.50 0.00 NA
#2 1 3 1.40 1.90 3
#3 1 6 -0.70 -0.20 NA
#4 1 9 2.20 2.70 9
#5 2 0 -0.63 0.00 NA
#6 2 3 0.70 1.33 3
#7 2 6 -2.64 -2.01 NA
#8 2 9 2.10 2.73 9
DATA.
dat <- read.table(text = "
sno time WHZ
1 0 -0.5
1 3 1.4
1 6 -0.7
1 9 2.2
2 0 -0.63
2 3 0.7
2 6 -2.64
2 9 2.1
", header = TRUE)

How do I sort one vector based on values of another (with data.frame)

I have a data frame `true_set` that I would like to reorder according to the index vectors stored in the rows of the matrix `orders`.
true_set <- data.frame(dose1=c(rep(1,5),rep(2,5),rep(3,5)), dose2=c(rep(1:5,3)),toxicity=c(0.05,0.1,0.15,0.3,0.45,0.1,0.15,0.3,0.45,0.55,0.15,0.3,0.45,0.55,0.6),efficacy=c(0.2,0.3,0.4,0.5,0.6,0.4,0.5,0.6,0.7,0.8,0.5,0.6,0.7,0.8,0.9),d=c(1:15))
orders<-matrix(nrow=3,ncol=15)
orders[1,]<-c(1,2,6,3,7,11,4,8,12,5,9,13,10,14,15)
orders[2,]<-c(1,6,2,3,7,11,12,8,4,5,9,13,14,10,15)
orders[3,]<-c(1,6,2,11,7,3,12,8,4,13,9,5,14,10,15)
The expected result would be:
First orders[1,] :
dose1 dose2 toxicity efficacy d
1 1 1 0.05 0.2 1
2 1 2 0.10 0.3 2
3 2 1 0.10 0.4 6
4 1 3 0.15 0.4 3
5 2 2 0.15 0.5 7
6 3 1 0.15 0.5 11
7 1 4 0.30 0.5 4
8 2 3 0.30 0.6 8
9 3 2 0.30 0.6 12
10 1 5 0.45 0.6 5
11 2 4 0.45 0.7 9
12 3 3 0.45 0.7 13
13 2 5 0.55 0.8 10
14 3 4 0.55 0.8 14
15 3 5 0.60 0.9 15
First orders[2,] : as above
First orders[3,] : as above
# Example data: 15 dose combinations, with d as the original row index.
true_set <- data.frame(
  dose1 = c(rep(1, 5), rep(2, 5), rep(3, 5)),
  dose2 = c(rep(1:5, 3)),
  toxicity = c(0.05, 0.1, 0.15, 0.3, 0.45, 0.1, 0.15, 0.3, 0.45, 0.55,
               0.15, 0.3, 0.45, 0.55, 0.6),
  efficacy = c(0.2, 0.3, 0.4, 0.5, 0.6, 0.4, 0.5, 0.6, 0.7, 0.8,
               0.5, 0.6, 0.7, 0.8, 0.9),
  d = c(1:15)
)
# Each row of orders is one permutation of the row indices of true_set.
orders <- matrix(nrow = 3, ncol = 15)
orders[1, ] <- c(1, 2, 6, 3, 7, 11, 4, 8, 12, 5, 9, 13, 10, 14, 15)
orders[2, ] <- c(1, 6, 2, 3, 7, 11, 12, 8, 4, 5, 9, 13, 14, 10, 15)
orders[3, ] <- c(1, 6, 2, 11, 7, 3, 12, 8, 4, 13, 9, 5, 14, 10, 15)
# Specify your order set in the row dimension
# (R names are case-sensitive: the data frame is true_set, not true_Set).
First_order <- true_set[orders[1, ], ]
Second_order <- true_set[orders[2, ], ]
Third_order <- true_set[orders[3, ], ]
# If you want to store all orders in a list, you can try the command below:
First_orders <- list(
  First_Order = true_set[orders[1, ], ],
  Second_Order = true_set[orders[2, ], ],
  Third_Order = true_set[orders[3, ], ]
)
# [[ ]] extracts the data frame itself; single [ ] would return a one-element list.
First_orders[[1]] # OR First_orders$First_Order
First_orders[[2]] # OR First_orders$Second_Order
First_orders[[3]] # OR First_orders$Third_Order
# If you want to combine the orders column wise, try the command below:
First_orders <- cbind(
  First_Order = true_set[orders[1, ], ],
  Second_Order = true_set[orders[2, ], ],
  Third_Order = true_set[orders[3, ], ]
)
# If you want to combine the orders row wise, try the command below:
First_orders <- rbind(
  First_Order = true_set[orders[1, ], ],
  Second_Order = true_set[orders[2, ], ],
  Third_Order = true_set[orders[3, ], ]
)

Reshape matrix to data frame

I have association matrix file that looks like this (4 rows and 3 columns) .
test=read.table("test.csv", sep=",", header=T)
head(test)
LosAngeles SanDiego Seattle
1 2 3
A 1 0.1 0.2 0.2
B 2 0.2 0.4 0.2
C 3 0.3 0.5 0.3
D 4 0.2 0.5 0.1
What I want to do is reshape this matrix into a data frame. The result should look something like this (12 (= 4 * 3) rows and 3 columns):
RowNum ColumnNum Value
1 1 0.1
2 1 0.2
3 1 0.3
4 1 0.2
1 2 0.2
2 2 0.4
3 2 0.5
4 2 0.5
1 3 0.2
2 3 0.2
3 3 0.3
4 3 0.1
That is, if my matrix file has 100 rows and 90 columns, I want to make a new data frame that contains 9000 (= 100 * 90) rows and 3 columns. I've tried to use the reshape package, but I do not seem to be able to get it right. Any suggestions on how to solve this problem?
Use as.data.frame.table. It's the boss:
# Build the 4 x 3 example row by row, then label both dimensions;
# the dimnames names ("row", "col") become the output column names.
m <- rbind(
  c(0.1, 0.2, 0.2),
  c(0.2, 0.4, 0.2),
  c(0.3, 0.5, 0.3),
  c(0.2, 0.5, 0.1)
)
dimnames(m) <- list(row = 1:4, col = 1:3)
m
# col
# row 1 2 3
# 1 0.1 0.2 0.2
# 2 0.2 0.4 0.2
# 3 0.3 0.5 0.3
# 4 0.2 0.5 0.1
# One output row per matrix cell, in column-major order.
as.data.frame.table(m)
# row col Freq
# 1 1 1 0.1
# 2 2 1 0.2
# 3 3 1 0.3
# 4 4 1 0.2
# 5 1 2 0.2
# 6 2 2 0.4
# 7 3 2 0.5
# 8 4 2 0.5
# 9 1 3 0.2
# 10 2 3 0.2
# 11 3 3 0.3
# 12 4 3 0.1
This should do the trick:
test <- as.matrix(read.table(text="
1 2 3
1 0.1 0.2 0.2
2 0.2 0.4 0.2
3 0.3 0.5 0.3
4 0.2 0.5 0.1", header=TRUE))
# row()/col() give the indices of every cell directly, so NA cells are kept
# (a test == test mask silently drops them, because NA == NA is NA).
data.frame(row = as.vector(row(test)),
           col = as.vector(col(test)),
           Value = as.vector(test),
           row.names = NULL)
# row col Value
#1 1 1 0.1
#2 2 1 0.2
#3 3 1 0.3
#4 4 1 0.2
#5 1 2 0.2
#6 2 2 0.4
#7 3 2 0.5
#8 4 2 0.5
#9 1 3 0.2
#10 2 3 0.2
#11 3 3 0.3
#12 4 3 0.1

Average matrix of different size by row names and column names

How to average different matrices (might be more than 2) that have different sizes? Rownames and colnames would serve to do the matching:
x1<- as.data.frame((matrix(c(0,0,1,0,2,3,1,1,1),nrow=3)))
colnames(x1)<-c('1','3','4')
rownames(x1)<- c('1','2','4')
> x1
1 3 4
1 0 0 1
2 0 2 1
4 1 3 1
x2<- as.data.frame((matrix(c(1,1,2,2,0,0,0,2,1,1,0,1,1,0,1,1),nrow=4)))
colnames(x2)<-c('1','2','3','4')
rownames(x2)<- c('1','2','3','4')
>x2
1 2 3 4
1 1 0 1 1
2 1 0 1 0
3 2 0 0 1
4 2 2 1 1
x3<- matrix(c(0.5,0,0.5,1 ,0.5,0,1,0.5, 2,0,0,1, 1.5,2,1.5,1),nrow=4, byrow = T)
colnames(x3)<-c('1','2','3','4')
rownames(x3)<- c('1','2','3','4')
The result would look like:
x3 <- averageMatDiffSizes(x1,x2)
x3
1 2 3 4
1 0.5 0 0.5 1.0
2 0.5 0 1.0 0.5
3 2.0 0 0.0 1.0
4 1.5 2 1.5 1.0
You could do:
# Empty (all-NA) matrix with x2's shape; x1's values are placed into it below.
xNew <- matrix(ncol=ncol(x2), nrow=nrow(x2), dimnames=list(1:4,1:4)) #as x2 is the larger dimension in the example.
# If both have different sizes and some columns are missing in both
# Create using ?union(). e.g. ncol=length(union(colnames(x1),colnames(x2))), similarly
# for nrow.
# "rowname colname" key for every cell of x1 and of x2.
indx <- outer(rownames(x1),colnames(x1), paste)
indx1 <- outer(rownames(x2),colnames(x2), paste)
# match() gives each x1 cell's linear position in x2's layout; fill those cells of xNew.
xNew[match(indx,indx1)] <- unlist(x1)
lst <- list(xNew, x2)
# `+` takes two operands, so this do.call form works for the two-element lst used here.
x3 <- do.call(`+`, lst)/length(lst)
# Cells absent from x1 are NA after the sum; fill them with x2's own value.
x3[is.na(x3)] <- unlist(x2)[!indx1%in% indx]
x3
# 1 2 3 4
# 1 0.5 0 0.5 1.0
# 2 0.5 0 1.5 0.5
# 3 2.0 0 0.0 1.0
# 4 1.5 2 2.0 1.0

Resources