I have a matrix as shown and I want to extract from it an other matrix where without any duplicated element in each row.
This is the input matrix
head(Data_Achat2)
ID_Achat 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
1 1349 433 405 451 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 4890 405 405 416 416 388 464 416 388 392 405 393 405 433 453 392 416 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 7881 405 384 390 395 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 8081 442 405 405 475 464 405 442 405 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5 9465 457 417 416 391 441 441 392 441 401 441 432 388 395 466 464 399 475 466 464 481 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
6 10626 432 390 433 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
In other word I want to get for example for the second row like this:
2 4890 405 416 388 464 388 392 393 433 453
Then, each row of the new matrix has only distincts element of the input one and all of results is in matrix (which include also 0 values for missing values).
I would row-wise apply a function that only retains the m unique values and then "pad" that vector to a length N with zeros, by adding N - m zeros to the unique values:
N <- ncol(Data_Achat2)
t(apply(Data_Achat2, 1, function(x){
uniques <- unique(x)
return(c(uniques, rep(0, N-length(uniques))))
}))
Which results in:
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] --- [,36] [,37]
1 1349 433 405 451 0 0 0 0 0 0 0 0 0 0 0 0 0 --- 0 0
2 4890 405 416 388 464 392 393 433 453 0 0 0 0 0 0 0 0 --- 0 0
3 7881 405 384 390 395 0 0 0 0 0 0 0 0 0 0 0 0 --- 0 0
4 8081 442 405 475 464 0 0 0 0 0 0 0 0 0 0 0 0 --- 0 0
5 9465 457 417 416 391 441 392 401 432 388 395 466 464 399 475 481 0 --- 0 0
6 10626 432 390 433 0 0 0 0 0 0 0 0 0 0 0 0 0 --- 0 0
Hi I've got a function which performs calculations on several columns (written with kind support form stack overflow community). I'd like to adapt it so that instead of calculating MOD for each iteration, it looks up a value in a table and uses that. The MOD for each column will be different.
Here is the function that does the calculations:
library(data.table)
dataDT<- data.frame(CAG = c(13, 14, 15, 17),
A01 = c(6485,35,132, 12),
A02 = c(0,42,56, 4))
thres <- 0.2
dataDT<-setDT(dataDT)
colsToBeUsed<-names(dataDT[,!'CAG'])
sumDataSetdata<-c()
sumDataSet<-unlist(lapply(X=1:length(colsToBeUsed),function(X){s=colsToBeUsed[X]
eval(parse(text=paste0('dataDT[',s,'<thres*max(',s,'),',s,':=0]')))
eval(parse(text=paste0('dataDT[,MOD',s,':=dataDT[',s,'==max(',s,'),CAG]]')))
eval(parse(text=paste0('dataDT[,norm',s,':=',s,'/sum(',s,')]')))
eval(parse(text=paste0('dataDT[,sum',s,':=',s,'/sum(',s,')*(CAG-MOD',s,'),]')))
eval(parse(text=paste0('rbind(sumDataSetdata,dataDT[,sum(sum',s,')])')))
}))
Here is the table which gives the MOD:
MODs <- data.frame(c(data.frame(samples = c('A01', 'A02', 'A03', 'A04'), MOD = c(117.8, 120.2, 124.5, 130.6))
Here is the table which says which MOD to use for each 'sample' column
ctrls <- (data.frame(samples = c('A01', 'A02', 'A03', 'A04'), ctrl = c(A01, A01, A03, A03))
Response to answer 1
Thank you, that works well for the example. I've been trying to apply it to my real data and am having a few difficulties. Here is the code for my real data.
library(data.table)
dataDT <- data.frame(area[,7:ncol(height)])
dataDT <- setDT(dataDT)
colsToBeUsed<-names(dataDT[,!'CAG'])
MODs <- data.frame(samples = samples$unique.inputdf.SampleFileName., MOD = htresults$mode)
ctrls <- data.frame(samples = samples$unique.inputdf.SampleFileName., ctrl = 'A01_RR20170609_FA_A01_2017-06-09_1.fsa')
myFun <- function(x, mod, cag, thres) {
x[x < (thres * max(x))] <- 0
norm_x <- x / sum(x)
sum_x <- norm_x * (cag - mod)
sum(sum_x)
}
transision_matrix <- merge(ctrls, MODs, by.x = "ctrl", by.y = "samples")
setDT(transision_matrix)
mf2 <- function(colname, dataDT, transision_matrix){
x <- dataDT[, colname, with = F][[1]]
mod <- transision_matrix[samples == colname, MOD]
cag <- dataDT[, "CAG"][[1]]
myFun(x, mod, cag, thres = 0.2)
}
sapply(colsToBeUsed, function(x) mf2(x, dataDT, transision_matrix))
The control sample for all columns in this experiment is A01_RR20170609_FA_A01_2017.06.09_1.fsa and its mode is 20.67000
This is the result I get
A01_RR20170609_FA_A01_2017.06.09_1.fsa A02_RR20170609_FA_A02_2017.06.09_1.fsa
0 0
A03_RR20170609_FA_A03_2017.06.09_1.fsa A04_RR20170609_FA_A04_2017.06.09_1.fsa
0 0
A05_RR20170609_FA_A05_2017.06.09_1.fsa A06_RR20170609_FA_A06_2017.06.09_1.fsa
0 0
A07_RR20170609_FA_A07_2017.06.09_1.fsa A08_RR20170609_FA_A08_2017.06.09_1.fsa
0 0
A09_RR20170609_FA_A09_2017.06.09_1.fsa A10_RR20170609_FA_A10_2017.06.09_1.fsa
0 0
A11_RR20170609_FA_A11_2017.06.09_1.fsa A12_RR20170609_FA_A12_2017.06.09_1.fsa
0 0
The results I'm expecting are:
[1] 4.108246 5.868355 4.608756 -1.159657 4.015066 4.364199 5.262355 4.337760 6.496672 5.574396
[11] 5.102111 8.911440
In case it's useful, here is info about the table 'height':
'data.frame': 660 obs. of 19 variables:
$ Dye/SamplePeak : chr "B,66" "B,67" "B,68" "B,69" ...
$ Marker : chr NA NA NA NA ...
$ Allele : chr NA NA NA NA ...
$ Size : num 144 147 148 150 151 ...
$ Area : num 148288 110 907 3355 1274 ...
$ DataPoint : num 2591 2622 2641 2655 2671 ...
$ CAG : num 13.9 14.9 15.5 15.9 16.4 ...
$ A01_RR20170609_FA_A01_2017-06-09_1.fsa: num 6485 32 125 450 211 ...
$ A02_RR20170609_FA_A02_2017-06-09_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A03_RR20170609_FA_A03_2017-06-09_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A04_RR20170609_FA_A04_2017-06-09_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A05_RR20170609_FA_A05_2017-06-09_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A06_RR20170609_FA_A06_2017-06-09_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A07_RR20170609_FA_A07_2017-06-09_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A08_RR20170609_FA_A08_2017-06-09_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A09_RR20170609_FA_A09_2017-06-09_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A10_RR20170609_FA_A10_2017-06-09_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A11_RR20170609_FA_A11_2017-06-09_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A12_RR20170609_FA_A12_2017-06-09_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
The table 'height' is quite large, but I've added the results from column 'CAG' and 'A01_RR20170609_FA_A01_2017.06.09_1.fsa' in case it's useful for working.
**CAG**
[1] 13.92667 14.88000 15.46333 15.89333 16.38333 16.84333 17.39667 17.79667 18.41000 18.78000 19.39333
[12] 19.76000 20.33667 20.67000 21.03333 21.36667 23.52000 24.45667 25.39667 27.31000 28.34667 29.32000
[23] 30.23333 31.17667 32.15000 32.72667 33.08333 33.67667 34.03667 34.60667 34.96667 35.54000 35.93333
[34] 36.54000 36.87333 37.49333 37.87333 38.50667 38.89000 39.48333 39.84667 40.45000 40.81000 41.41000
[45] 41.73667 42.45333 42.77667 43.34000 43.69333 46.21000 49.21333 52.21667 52.60667 13.90000 14.95000
[56] 15.47333 15.99667 16.46000 16.86333 17.48000 17.85000 18.43667 18.80667 19.42667 19.76333 20.43667
[67] 21.38667 21.72333 22.70000 23.31333 23.61667 24.23000 24.59667 25.60333 26.55333 27.16333 27.44000
[78] 28.42000 29.40000 30.37667 30.98667 31.32333 32.30000 33.23667 34.16667 35.13333 36.07667 37.05667
[89] 38.03333 38.63333 39.01667 39.60667 39.97333 40.57667 40.94000 41.54333 41.87333 42.47333 42.83333
[100] 43.46000 43.78667 44.70333 12.48333 13.03667 13.92667 14.88000 15.49333 15.86333 16.44667 16.78333
[111] 17.36667 17.82667 18.38000 18.84333 19.42333 19.79000 20.34000 20.79667 21.28333 21.74333 23.60000
[122] 24.54000 25.51333 26.48667 27.43333 28.40667 29.32000 29.96000 30.26333 30.90333 31.36000 32.24000
[133] 33.17333 34.25667 35.40667 36.01667 36.56667 36.99667 37.58667 37.97000 38.60333 38.98333 39.57667
[144] 39.94000 40.54667 40.90667 41.51000 41.86667 42.43667 42.79333 43.38667 43.74000 12.63667 13.95667
[155] 15.52333 15.92333 16.53667 16.90667 17.49000 17.89000 18.44333 18.81000 19.42333 19.79000 20.40000
[166] 20.76667 21.37667 21.74333 22.38000 24.54000 25.51333 27.46333 28.43667 29.04667 29.38000 30.29333
[177] 31.26667 32.24000 33.17333 34.10667 35.04333 35.68000 35.98667 36.59667 36.96333 37.55667 37.97000
[188] 38.60333 38.95333 39.57667 39.94000 40.51333 40.87333 41.47333 41.83333 42.43000 42.75667 43.35000
[199] 43.70667 44.29667 44.65000 45.24000 45.56000 46.54333 47.54333 12.45333 13.04000 13.87333 14.86667
[210] 15.48667 15.86000 16.44667 16.79000 17.37667 17.81333 18.43333 18.80667 19.42667 19.79667 20.41333
[221] 20.75000 21.36667 21.73333 22.59667 23.54667 24.53000 25.51000 26.46000 27.44000 28.38667 29.00000
[232] 29.33667 29.91667 30.28333 31.26333 31.84333 33.08667 34.08000 34.68667 35.08333 35.97333 36.37333
[243] 36.96000 37.59000 37.94667 38.59000 38.98000 39.57667 39.94333 40.55000 40.88333 41.48667 41.85000
[254] 42.42000 42.77667 43.37333 43.73000 44.29333 44.64667 45.26667 45.56000 12.57667 12.94000 13.55333
[265] 13.92000 14.50000 14.86667 15.41667 15.87667 16.51667 16.88667 17.49667 17.86333 18.47667 18.81333
[276] 19.42333 19.79000 20.40000 20.76667 21.40667 21.74333 23.60000 27.42667 28.94333 29.76333 30.33667
[287] 32.21333 33.17000 34.09333 34.57333 35.05667 35.69000 36.05333 36.69000 36.99667 37.58667 37.97000
[298] 38.60333 38.98333 39.60333 39.96333 40.56333 40.92000 41.48667 41.84333 42.43333 42.78667 43.40667
[309] 43.70000 12.05333 12.51333 12.91333 13.90000 14.48667 14.85667 15.47667 15.78333 16.40000 16.83333
[320] 17.48000 17.88333 18.47000 18.81000 19.39333 19.79000 20.40000 20.76667 21.34667 21.71000 22.26000
[331] 22.62333 23.56667 24.57333 26.43000 27.99333 28.66667 29.40000 31.29333 32.30000 33.17000 34.09000
[342] 34.69000 35.08000 35.65333 36.02000 36.23000 36.93333 37.52667 37.94667 38.59000 38.94667 39.57667
[353] 39.94000 40.54333 40.90333 41.47333 41.83333 42.42667 42.78333 43.40333 43.72667 52.24333 52.57667
[364] 12.02667 12.30000 13.73667 14.99333 15.57333 15.94000 16.52333 16.92000 17.44000 17.96000 18.44667
[375] 18.81333 19.45333 19.81667 20.42000 20.78333 21.35667 21.66000 22.41333 24.52333 25.52000 26.45333
[386] 27.42667 28.06333 28.33333 28.94000 29.36333 29.91000 30.30333 30.88000 31.24333 31.85000 32.18333
[397] 33.14333 33.77000 34.13000 34.70000 35.03000 35.66333 36.02667 36.63333 37.00000 37.58333 37.99000
[408] 38.58333 38.96000 39.57333 39.93333 40.53000 40.88667 41.48000 41.83667 42.43000 42.78333 43.37000
[419] 43.69333 44.36667 44.69000 45.27333 45.65000 46.24000 46.54000 50.86667 51.52667 12.42333 13.03333
[430] 13.85667 14.47000 14.86667 15.47667 15.84667 16.39667 16.70000 17.40667 17.80333 19.21000 19.79000
[441] 20.18667 20.73667 21.31667 21.71000 22.20000 22.59333 23.56667 24.51000 25.48333 26.42667 27.03333
[452] 27.33667 28.33667 29.30667 29.94333 30.91333 31.91000 32.78667 33.64667 34.00333 34.63333 34.99333
[463] 35.59667 35.99333 36.54000 36.93667 37.52333 37.90333 38.54000 38.92000 39.54667 39.90667 40.51000
[474] 40.84000 41.44000 41.79667 42.39333 42.74667 43.31000 43.69333 44.22000 52.21667 52.60667 12.27000
[485] 13.79333 14.95000 15.44000 16.01667 16.44333 16.84000 17.47667 17.84333 18.45000 18.81667 19.42333
[496] 19.69667 20.12000 20.48000 21.38667 21.68667 22.59333 23.52667 24.52333 25.12667 25.45667 26.45667
[507] 26.97333 27.39667 28.33667 28.97333 29.36667 29.91333 30.27667 31.24667 31.82000 32.18333 33.14000
[518] 34.06000 34.65667 35.01667 35.61333 35.97333 36.54667 36.94000 37.52000 37.92667 38.55333 38.92667
[529] 39.51333 39.87000 40.46667 40.82000 41.41333 41.76667 42.35667 42.71000 43.29667 43.64667 44.23000
[540] 44.58000 45.21667 45.53667 46.15000 46.51000 54.90333 12.13000 12.51667 13.81333 14.84000 15.44333
[551] 15.89333 16.74000 17.37333 17.76333 18.33667 18.73000 19.30333 19.60333 20.37667 21.30333 21.63000
[562] 22.52333 23.50667 24.48667 25.46667 26.38667 26.92333 27.34000 27.90667 28.29333 29.24333 30.22667
[573] 30.79333 31.24000 31.83333 32.43000 32.93000 33.46000 34.02333 34.67333 34.97000 35.53667 35.92333
[584] 36.52333 36.88333 37.48667 37.85667 38.50333 38.87333 39.45333 39.83667 40.42667 40.81000 41.36667
[595] 41.71667 42.33000 42.68000 43.26333 43.61000 44.56333 48.30000 12.46000 12.84667 13.83333 14.43000
[606] 14.82000 15.35667 15.65667 16.40333 17.42000 17.78000 18.55667 18.88333 19.30333 19.69000 20.31333
[617] 20.64000 21.23333 21.61667 22.21000 22.47667 23.45667 24.04667 24.40333 24.99333 25.35000 25.94000
[628] 26.29667 26.92000 27.27667 27.84000 28.25667 28.85000 29.20333 30.15333 31.13000 31.72000 32.10667
[639] 32.69667 33.04333 33.91667 34.91000 35.91000 36.44333 36.79667 37.39333 37.79333 38.41000 38.78000
[650] 39.39333 39.77333 40.36000 40.71000 41.29000 41.64000 42.24667 42.59333 43.54667 44.49000 45.40333
A01_RR20170609_FA_A01_2017.06.09_1.fsa
[1] 6485 32 125 450 211 703 553 1549 1360 3526 5028 13610 15986 31233 713 1260
[17] 31 37 33 46 43 48 40 63 78 33 118 40 176 65 296 103
[33] 501 242 923 545 2006 1355 4348 2564 8615 3886 12985 227 669 85 61 57
[49] 103 32 42 50 64 0 0 0 0 0 0 0 0 0 0 0
[65] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[81] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[97] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[113] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[129] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[145] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[161] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[177] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[193] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[209] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[225] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[241] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[257] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[273] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[289] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[305] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[321] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[337] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[353] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[369] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[385] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[401] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[417] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[433] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[449] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[465] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[481] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[497] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[513] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[529] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[545] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[561] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[577] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[593] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[609] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[625] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[641] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[657] 0 0 0 0
Response to answer 2
dputs as requested
dput(transision_matrix)
structure(list(ctrl = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "A01_RR20170609_FA_A01_2017-06-09_1.fsa", class = "factor"),
samples = structure(1:12, .Label = c("A01_RR20170609_FA_A01_2017-06-09_1.fsa",
"A02_RR20170609_FA_A02_2017-06-09_1.fsa", "A03_RR20170609_FA_A03_2017-06-09_1.fsa",
"A04_RR20170609_FA_A04_2017-06-09_1.fsa", "A05_RR20170609_FA_A05_2017-06-09_1.fsa",
"A06_RR20170609_FA_A06_2017-06-09_1.fsa", "A07_RR20170609_FA_A07_2017-06-09_1.fsa",
"A08_RR20170609_FA_A08_2017-06-09_1.fsa", "A09_RR20170609_FA_A09_2017-06-09_1.fsa",
"A10_RR20170609_FA_A10_2017-06-09_1.fsa", "A11_RR20170609_FA_A11_2017-06-09_1.fsa",
"A12_RR20170609_FA_A12_2017-06-09_1.fsa"), class = "factor"),
MOD = c(20.67, 20.67, 20.67, 20.67, 20.67, 20.67, 20.67,
20.67, 20.67, 20.67, 20.67, 20.67)), .Names = c("ctrl", "samples",
"MOD"), row.names = c(NA, -12L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x10180d178>, index = structure(integer(0), "`__samples`" = integer(0)))
> dput(ctrls)
structure(list(samples = structure(1:12, .Label = c("A01_RR20170609_FA_A01_2017-06-09_1.fsa",
"A02_RR20170609_FA_A02_2017-06-09_1.fsa", "A03_RR20170609_FA_A03_2017-06-09_1.fsa",
"A04_RR20170609_FA_A04_2017-06-09_1.fsa", "A05_RR20170609_FA_A05_2017-06-09_1.fsa",
"A06_RR20170609_FA_A06_2017-06-09_1.fsa", "A07_RR20170609_FA_A07_2017-06-09_1.fsa",
"A08_RR20170609_FA_A08_2017-06-09_1.fsa", "A09_RR20170609_FA_A09_2017-06-09_1.fsa",
"A10_RR20170609_FA_A10_2017-06-09_1.fsa", "A11_RR20170609_FA_A11_2017-06-09_1.fsa",
"A12_RR20170609_FA_A12_2017-06-09_1.fsa"), class = "factor"),
ctrl = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), class = "factor", .Label = "A01_RR20170609_FA_A01_2017-06-09_1.fsa")), .Names = c("samples",
"ctrl"), row.names = c(NA, -12L), class = "data.frame")
> dput(MODs)
structure(list(samples = structure(1:12, .Label = c("A01_RR20170609_FA_A01_2017-06-09_1.fsa",
"A02_RR20170609_FA_A02_2017-06-09_1.fsa", "A03_RR20170609_FA_A03_2017-06-09_1.fsa",
"A04_RR20170609_FA_A04_2017-06-09_1.fsa", "A05_RR20170609_FA_A05_2017-06-09_1.fsa",
"A06_RR20170609_FA_A06_2017-06-09_1.fsa", "A07_RR20170609_FA_A07_2017-06-09_1.fsa",
"A08_RR20170609_FA_A08_2017-06-09_1.fsa", "A09_RR20170609_FA_A09_2017-06-09_1.fsa",
"A10_RR20170609_FA_A10_2017-06-09_1.fsa", "A11_RR20170609_FA_A11_2017-06-09_1.fsa",
"A12_RR20170609_FA_A12_2017-06-09_1.fsa"), class = "factor"),
MOD = c(20.67, 19.7633333333333, 16.7833333333333, 21.7433333333333,
16.79, 14.8666666666667, 15.7833333333333, 21.66, 16.3966666666667,
19.6966666666667, 19.6033333333333, 15.3566666666667)), .Names = c("samples",
"MOD"), row.names = c(NA, -12L), class = "data.frame")
Your approach looks quite complicated, so I tried to use base vectors for calculations.
1) Firstly, I crated function which takes vectors as arguments, because your supplied code with evalve was hard to understand.
myFun <- function(x, mod, cag, thres) {
x[x < (thres * max(x))] <- 0
norm_x <- x / sum(x)
sum_x <- norm_x * (cag - mod)
sum(sum_x)
}
I hope I got it right.
2) Then we create transition matrix, from which we will take MOD values.
transision_matrix <- merge(ctrls, MODs, by.x = "ctrl", by.y = "samples")
setDT(transision_matrix)
3) Then we can write function which takes column name and data.table`s as arguments to obtain your desired results:
mf2 <- function(colname, dataDT, transision_matrix){
x <- dataDT[, colname, with = F][[1]]
mod <- transision_matrix[samples == colname, MOD]
cag <- dataDT[, "CAG"][[1]]
myFun(x, mod, cag, thres = 0.2)
}
4) And lastly we need only to apply/supply it over the column name vector
sapply(colsToBeUsed, function(x) mf2(x, dataDT, transision_matrix))
A01 A02
-104.8000 -103.2286
UPDATE
It looks like the problem is in the names of samples column in transision_matrix, the names in it does not match the column names. You should change either column names of dataDT or values of samples column, that their format matches.
> 'A01_RR20170609_FA_A01_2017-06-09_1.fsa' == "A01_RR20170609_FA_A01_2017.06.09_1.fsa"
[1] FALSE
(change the dots)
You can do it like this:
> colnames(dataDT)
[1] "A01_RR20170609_FA_A01_2017-06-09_1.fsa" "CAG"
> colnames(dataDT) <- gsub(".","-", colnames(dataDT), fixed = T) #change all dots to -
> colnames(dataDT) <- gsub("-fsa",".fsa", colnames(dataDT), fixed = T) #change back the end of string
> colnames(dataDT)
[1] "A01_RR20170609_FA_A01_2017-06-09_1.fsa" "CAG"
Now everything should work.
Thanks. This is the function at the moment:
library(data.table)
dataDT <- height[,13:ncol(height)] #Create a data frame containing CAG and height columns
dataDT <- setDT(dataDT) #Convert to a data table
colsToBeUsed<-names(dataDT[,!'CAG']) #Assigns the columns to be analysed
myFun <- function(x, mod, cag, thres) { #Function that takes vectors as arguments.
x[x < (thres * max(x))] <- 0 #First sets all heights < 0.2*threshold to 0.
norm_x <- x / sum(x) #Then normalises heights by dividing by the sum of the heights.
sum_x <- norm_x * (cag - mod) #Then multiplies by the change in CAG from mode
sum(sum_x) #Then sums the results
}
transision_matrix <- merge(propsettings, modeHt, by.x = "control", by.y = "sample") #Transition matrix that determines control modes for each sample
setDT(transision_matrix)
mf2 <- function(colname, dataDT, transision_matrix){ #Function that takes column name and data table as arguments.
x <- dataDT[, colname, with = F][[1]]
mod <- transision_matrix[sample == colname, mode] #Vector of CONTROL modes to use
cag <- dataDT[, "CAG"][[1]]
thres <- resultsHt$iithreshold #THIS LIKELY SOURCE OF ERROR AS IT WORKS WITH 0.2
myFun(x, mod, cag, thres)
}
iiHt <- sapply(colsToBeUsed, function(x) mf2(x, dataDT, transision_matrix))
resultsHt$iiHt <- iiHt
This is 'resultsHt$iithreshold':
[1] 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2
[29] 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2
This is modeHt:
sample mode
1 A01_MF20170623_FA_A01_2017-06-23_1.fsa 130.93667
2 A02_MF20170623_FA_A02_2017-06-23_1.fsa 131.14667
3 A03_MF20170623_FA_A03_2017-06-23_1.fsa 132.07333
4 A05_MF20170623_FA_A05_2017-06-23_1.fsa 151.97333
5 A06_MF20170623_FA_A06_2017-06-23_1.fsa 128.39333
6 A07_MF20170623_FA_A07_2017-06-23_1.fsa 116.02333
7 A08_MF20170623_FA_A08_2017-06-23_1.fsa 127.42667
8 A09_MF20170623_FA_A09_2017-06-23_1.fsa 163.22000
9 A10_MF20170623_FA_A10_2017-06-23_1.fsa 131.92667
10 A11_MF20170623_FA_A11_2017-06-23_1.fsa 133.57333
11 A12_MF20170623_FA_A12_2017-06-23_1.fsa 164.85333
12 B01_MF20170623_FA_B01_2017-06-23_1.fsa 180.34333
13 B02_MF20170623_FA_B02_2017-06-23_1.fsa 133.08000
14 B03_MF20170623_FA_B03_2017-06-23_1.fsa 163.53333
15 B04_MF20170623_FA_B04_2017-06-23_1.fsa 133.13333
16 B05_MF20170623_FA_B05_2017-06-23_1.fsa 133.08000
17 B06_MF20170623_FA_B06_2017-06-23_1.fsa 167.23000
18 B07_MF20170623_FA_B07_2017-06-23_1.fsa 115.05667
19 B08_MF20170623_FA_B08_2017-06-23_1.fsa 179.62333
20 C01_MF20170623_FA_C01_2017-06-23_1.fsa 115.93000
21 C02_MF20170623_FA_C02_2017-06-23_1.fsa 115.17333
22 C05_MF20170623_FA_C05_2017-06-23_1.fsa 131.18667
23 C07_MF20170623_FA_C07_2017-06-23_1.fsa 131.13333
24 C08_MF20170623_FA_C08_2017-06-23_1.fsa 131.13000
25 C09_MF20170623_FA_C09_2017-06-23_1.fsa 130.09333
26 C10_MF20170623_FA_C10_2017-06-23_1.fsa 115.09000
27 C11_MF20170623_FA_C11_2017-06-23_1.fsa 130.04000
28 C12_MF20170623_FA_C12_2017-06-23_1.fsa 115.70000
29 D02_MF20170623_FA_D02_2017-06-23_1.fsa 116.03667
30 D03_MF20170623_FA_D03_2017-06-23_1.fsa 131.14000
31 D04_MF20170623_FA_D04_2017-06-23_1.fsa 115.22667
32 D05_MF20170623_FA_D05_2017-06-23_1.fsa 19.88000
33 D06_MF20170623_FA_D06_2017-06-23_1.fsa 19.91000
34 D08_MF20170623_FA_D08_2017-06-23_1.fsa 19.84667
35 D10_MF20170623_FA_D10_2017-06-23_1.fsa 72.32333
36 D11_MF20170623_FA_D11_2017-06-23_1.fsa 130.00333
37 D12_MF20170623_FA_D12_2017-06-23_1.fsa 130.01333
38 A01_MF20170522_FA_A01_2017-05-22_1.fsa 136.94667
39 C02_MF20170529_FA_C02_2017-05-30_1.fsa 132.31667
40 B08_MF20170522_FA_B08_2017-05-22_1.fsa 121.00000
This is 'propsettings':
sample control
1 A01_MF20170623_FA_A01_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
2 A02_MF20170623_FA_A02_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
3 A03_MF20170623_FA_A03_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
4 A05_MF20170623_FA_A05_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
5 A06_MF20170623_FA_A06_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
6 A07_MF20170623_FA_A07_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
7 A08_MF20170623_FA_A08_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
8 A09_MF20170623_FA_A09_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
9 A10_MF20170623_FA_A10_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
10 A11_MF20170623_FA_A11_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
11 A12_MF20170623_FA_A12_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
12 B01_MF20170623_FA_B01_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
13 B02_MF20170623_FA_B02_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
14 B03_MF20170623_FA_B03_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
15 B04_MF20170623_FA_B04_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
16 B05_MF20170623_FA_B05_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
17 B06_MF20170623_FA_B06_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
18 B07_MF20170623_FA_B07_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
19 B08_MF20170623_FA_B08_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
20 C01_MF20170623_FA_C01_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
21 C02_MF20170623_FA_C02_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
22 C05_MF20170623_FA_C05_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
23 C07_MF20170623_FA_C07_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
24 C08_MF20170623_FA_C08_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
25 C09_MF20170623_FA_C09_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
26 C10_MF20170623_FA_C10_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
27 C11_MF20170623_FA_C11_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
28 C12_MF20170623_FA_C12_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
29 D02_MF20170623_FA_D02_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
30 D03_MF20170623_FA_D03_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
31 D04_MF20170623_FA_D04_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
32 D05_MF20170623_FA_D05_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
33 D06_MF20170623_FA_D06_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
34 D08_MF20170623_FA_D08_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
35 D10_MF20170623_FA_D10_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
36 D11_MF20170623_FA_D11_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
37 D12_MF20170623_FA_D12_2017-06-23_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
38 A01_MF20170522_FA_A01_2017-05-22_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
39 C02_MF20170529_FA_C02_2017-05-30_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
40 B08_MF20170522_FA_B08_2017-05-22_1.fsa B08_MF20170522_FA_B08_2017-05-22_1.fsa
This is a summary of what 'height' looks like:
> str(height)
'data.frame': 1078 obs. of 53 variables:
$ Dye/SamplePeak : chr "B,65" "B,66" "B,67" "B,68" ...
$ Marker : chr NA NA NA NA ...
$ Allele : chr NA NA NA NA ...
$ Size : num 418 432 435 438 441 ...
$ Area : num 285 354 300 334 342 385 359 370 410 439 ...
$ DataPoint : num 5665 5827 5859 5890 5924 ...
$ flank : num 108 108 108 108 108 108 108 108 108 108 ...
$ correction : num 2 2 2 2 2 2 2 2 2 2 ...
$ start : num 100 100 100 100 100 100 100 100 100 100 ...
$ end : num 200 200 200 200 200 200 200 200 200 200 ...
$ control : chr "B08_MF20170522_FA_B08_2017-05-22_1.fsa" "B08_MF20170522_FA_B08_2017-05-22_1.fsa" "B08_MF20170522_FA_B08_2017-05-22_1.fsa" "B08_MF20170522_FA_B08_2017-05-22_1.fsa" ...
$ iithreshold : num 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 ...
$ CAG : num 105 110 111 112 113 ...
$ A01_MF20170623_FA_A01_2017-06-23_1.fsa: num 31 32 32 33 40 37 36 41 45 38 ...
$ A02_MF20170623_FA_A02_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A03_MF20170623_FA_A03_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A05_MF20170623_FA_A05_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A06_MF20170623_FA_A06_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A07_MF20170623_FA_A07_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A08_MF20170623_FA_A08_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A09_MF20170623_FA_A09_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A10_MF20170623_FA_A10_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A11_MF20170623_FA_A11_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A12_MF20170623_FA_A12_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ B01_MF20170623_FA_B01_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ B02_MF20170623_FA_B02_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ B03_MF20170623_FA_B03_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ B04_MF20170623_FA_B04_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ B05_MF20170623_FA_B05_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ B06_MF20170623_FA_B06_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ B07_MF20170623_FA_B07_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ B08_MF20170623_FA_B08_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ C01_MF20170623_FA_C01_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ C02_MF20170623_FA_C02_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ C05_MF20170623_FA_C05_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ C07_MF20170623_FA_C07_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ C08_MF20170623_FA_C08_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ C09_MF20170623_FA_C09_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ C10_MF20170623_FA_C10_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ C11_MF20170623_FA_C11_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ C12_MF20170623_FA_C12_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ D02_MF20170623_FA_D02_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ D03_MF20170623_FA_D03_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ D04_MF20170623_FA_D04_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ D05_MF20170623_FA_D05_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ D06_MF20170623_FA_D06_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ D08_MF20170623_FA_D08_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ D10_MF20170623_FA_D10_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ D11_MF20170623_FA_D11_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ D12_MF20170623_FA_D12_2017-06-23_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ A01_MF20170522_FA_A01_2017-05-22_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ C02_MF20170529_FA_C02_2017-05-30_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
$ B08_MF20170522_FA_B08_2017-05-22_1.fsa: num 0 0 0 0 0 0 0 0 0 0 ...
I have a dataframe containing the daily rainfall values at 76 stations from 1964-2013. Each row is a different month for a particular station. Here is a snippet of the dataframe-
Station Year Month Days 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
USC00020750 1964 1 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 25 0 23 51 36 0 0 0 0 0 0 0 0
USC00020750 1964 2 29 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 48 0 0 0 3 0 0 0 0 0 0 Inf Inf
USC00020750 1964 3 31 0 46 51 0 0 36 41 46 0 0 0 0 43 0 0 0 0 0 0 0 0 53 99 140 36 0 0 0 0 0 0
USC00020750 1964 4 30 5 69 23 30 0 18 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 33 13 0 0 0 15 0 Inf
USC00020750 1964 5 31 0 0 0 0 0 0 43 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 51 8 0 0 0 0
USC00020750 1964 6 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 38 0 0 0 Inf
USC00020750 1964 7 31 0 0 0 0 0 0 0 0 0 0 0 0 41 0 13 13 0 0 0 0 8 51 0 71 0 10 0 0 20 165 25
USC00020750 1964 8 31 8 30 137 0 0 5 89 0 0 0 18 64 5 0 0 0 0 0 0 0 0 0 0 0 0 76 0 0 0 0 0
USC00020750 1964 9 30 0 0 0 0 0 119 0 0 0 0 0 0 0 41 25 0 0 0 0 0 25 0 0 0 0 0 0 0 0 0 Inf
USC00020750 1964 10 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
USC00020750 1964 11 30 0 5 0 0 0 0 0 0 0 0 91 0 0 0 36 94 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Inf
USC00020750 1964 12 31 0 107 20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 79 152 0 0 0 0 0 0 0 0 0 0 0 0
...
Station Year Month Days 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
USW00093129 2013 10 31 0 0 0 0 0 0 0 0 43 15 0 0 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 41 3 8 0
USW00093129 2013 11 30 0 0 0 23 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 79 18 20 0 0 0 0 0 0 0 Inf
USW00093129 2013 12 31 0 0 175 33 0 0 3 0 0 0 0 0 0 0 0 0 0 0 5 15 0 0 0 0 0 0 0 0 0 0 0
I am trying to find the length of the longest stretch of non-zero rainfall values for each row and the total rainfall in that stretch. The easiest way to find the length of the longest stretch would be to convert the dataframe to 0s and 1s, use rle and apply max(y$lengths[y$values!=0]) along each row. But how do I find the sum of the values?
Thanks for helping out, in advance!
Not exactly a one-liner, but this works :
df <- read.table(header=TRUE,stringsAsFactors=FALSE,check.names=FALSE,text=
"Station Year Month Days 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
USC00020750 1964 1 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 25 0 23 51 36 0 0 0 0 0 0 0 0
USC00020750 1964 2 29 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 48 0 0 0 3 0 0 0 0 0 0 Inf Inf
USC00020750 1964 3 31 0 46 51 0 0 36 41 46 0 0 0 0 43 0 0 0 0 0 0 0 0 53 99 140 36 0 0 0 0 0 0
USC00020750 1964 4 30 5 69 23 30 0 18 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 33 13 0 0 0 15 0 Inf
USC00020750 1964 5 31 0 0 0 0 0 0 43 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 51 8 0 0 0 0
USC00020750 1964 6 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 38 0 0 0 Inf
USC00020750 1964 7 31 0 0 0 0 0 0 0 0 0 0 0 0 41 0 13 13 0 0 0 0 8 51 0 71 0 10 0 0 20 165 25
USC00020750 1964 8 31 8 30 137 0 0 5 89 0 0 0 18 64 5 0 0 0 0 0 0 0 0 0 0 0 0 76 0 0 0 0 0
USC00020750 1964 9 30 0 0 0 0 0 119 0 0 0 0 0 0 0 41 25 0 0 0 0 0 25 0 0 0 0 0 0 0 0 0 Inf
USC00020750 1964 10 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
USC00020750 1964 11 30 0 5 0 0 0 0 0 0 0 0 91 0 0 0 36 94 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Inf
USC00020750 1964 12 31 0 107 20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 79 152 0 0 0 0 0 0 0 0 0 0 0 0")
res <- lapply(1:nrow(df), function(r){
monthDays <- df[r,'Days']
rain <- as.numeric(df[r,(1:monthDays) + 4])
enc <- rle(rain > 0)
if(all(!enc$values))
return(c(0,0))
len <- enc$lengths
len[!enc$values] <- 0
max.idx <- which.max(len)
lastIdx <- cumsum(enc$lengths)[max.idx]
firstIdx <- lastIdx - enc$lengths[max.idx] + 1
tot <- sum(rain[firstIdx:lastIdx])
stretch <- lastIdx - firstIdx + 1
return(c(stretch,tot))
})
columnsToAdd <- do.call(rbind,res)
colnames(columnsToAdd) <- c('StretchLen','StretchRain')
df2 <- cbind(df,columnsToAdd)
Result :
# We print the result without months values for better readability
> df2[,-(5:35)]
Station Year Month Days StretchLen StretchRain
1 USC00020750 1964 1 31 3 110
2 USC00020750 1964 2 29 1 48
3 USC00020750 1964 3 31 4 328
4 USC00020750 1964 4 30 4 127
5 USC00020750 1964 5 31 2 59
6 USC00020750 1964 6 30 1 38
7 USC00020750 1964 7 31 3 210
8 USC00020750 1964 8 31 3 175
9 USC00020750 1964 9 30 2 66
10 USC00020750 1964 10 31 0 0
11 USC00020750 1964 11 30 2 130
12 USC00020750 1964 12 31 2 127
BTW, if you want to stick with apply, it would be like this :
columnsToAdd <-
t(apply(df[,-(1:3)],MARGIN=1,function(r){
monthDays <- r[1]
rain <- as.numeric(r[-1])
enc <- rle(rain > 0)
if(all(!enc$values))
return(c(0,0))
len <- enc$lengths
len[!enc$values] <- 0
max.idx <- which.max(len)
lastIdx <- cumsum(enc$lengths)[max.idx]
firstIdx <- lastIdx - enc$lengths[max.idx] + 1
tot <- sum(rain[firstIdx:lastIdx])
stretch <- lastIdx - firstIdx + 1
return(c(stretch,tot))
}))
colnames(columnsToAdd) <- c('StretchLen','StretchRain')
df2 <- cbind(df,columnsToAdd)
I don't like using apply on data.frame's since it has been created for matrices and so it coerces the columns to the same type before calling the function (hence if you work on columns of different types you need to be careful).
Here's another solution with dplyr/tidyr
data %>%
gather(day, rain, -Station, -Year, -Month, -Days) %>%
arrange(Station, Year, Month, day) %>%
group_by(Station, Year, Month) %>%
mutate(previous_rain = lag(rain)) %>%
filter(!(rain %in% c(0, Inf))) %>%
mutate(storm = cumsum(previous_rain %in% c(0, NA))) %>%
group_by(Station, Year, Month, storm) %>%
summarize(total_rain = sum(rain),
number_of_days = n(),
start_day = first(day),
end_day = last(day)) %>%
arrange(desc(number_of_days)) %>%
slice(1)
Here's another take at it, where I've used the rle() function to find run lengths. It's protracted but primarily to make it clear what is happening - you could shorten it easily.
raindf <-
tmp <- read.table(textConnection(" Station Year Month Days 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
USC00020750 1964 1 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 25 0 23 51 36 0 0 0 0 0 0 0 0
USC00020750 1964 2 29 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 48 0 0 0 3 0 0 0 0 0 0 Inf Inf
USC00020750 1964 3 31 0 46 51 0 0 36 41 46 0 0 0 0 43 0 0 0 0 0 0 0 0 53 99 140 36 0 0 0 0 0 0
USC00020750 1964 4 30 5 69 23 30 0 18 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 33 13 0 0 0 15 0 Inf
USC00020750 1964 5 31 0 0 0 0 0 0 43 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 51 8 0 0 0 0
USC00020750 1964 6 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 38 0 0 0 Inf
USC00020750 1964 7 31 0 0 0 0 0 0 0 0 0 0 0 0 41 0 13 13 0 0 0 0 8 51 0 71 0 10 0 0 20 165 25
USC00020750 1964 8 31 8 30 137 0 0 5 89 0 0 0 18 64 5 0 0 0 0 0 0 0 0 0 0 0 0 76 0 0 0 0 0
USC00020750 1964 9 30 0 0 0 0 0 119 0 0 0 0 0 0 0 41 25 0 0 0 0 0 25 0 0 0 0 0 0 0 0 0 Inf
USC00020750 1964 10 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
USC00020750 1964 11 30 0 5 0 0 0 0 0 0 0 0 91 0 0 0 36 94 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Inf
USC00020750 1964 12 31 0 107 20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 79 152 0 0 0 0 0 0 0 0 0 0 0 0"), header = TRUE)
rainfall <- unlist(as.data.frame(t(raindf[1:3, -c(1:4)])), use.names = FALSE)
rainfall <- rainfall[!is.infinite(rainfall)]
rainfall[rainfall > 0] <- 1
rainyruns <- rle(rainfall)
rainyrunsDf <- data.frame(lengths = rainyruns$lengths, values = rainyruns$values)
rainyrunsDf <- subset(rainyrunsDf, values != 0)
rainyrunsDf <- rainyrunsDf[order(rainyrunsDf$lengths, decreasing = TRUE), ]
rainyrunsDf[1,1]
## [1] 4