Identifying misclassified values in a confusion matrix in R - r

I am using the caret package along with the confusionMatrix function and I would like to know whether it is possible to find out exactly which values were not classified properly.
Here is a subset of my train data
train_sub <- structure(
list(
corr = c(
0.629922866893549,
0.632354159559817,
0.656112138936032,
0.4469719807955,
0.598136079870775,
0.314461239093862,
0.379065842199838,
0.347331370037428,
0.310270891798492,
0.361064451331448,
0.335628455451358
),
rdist = c(
0.775733824285612,
0.834148208687529,
0.884167982488944,
0.633989717138057,
0.850225777237626,
0.626197919283803,
0.649597055761598,
0.680382136363523,
0.627828985862852,
0.713674404108905,
0.646094473468118
),
CCF2 = c(
0.634465565134314,
0.722096802135009,
0.792385621105087,
0.46497582143802,
0.739612023831014,
0.470724554509749,
0.505961260826622,
0.527876803999064,
0.461724328071479,
0.564117580569802,
0.490084457081904
),
Wcorr = c(
0.629,
0.613,
0.812,
0.424,
0.593,
0.36,
0.346,
0.286,
0.333,
0.381,
0.333
),
Wcorr2 = c(
0.735,
0.743,
0.802,
0.588,
0.691,
0.632,
0.61,
0.599,
0.599,
0.632,
0.613
),
Wcorr3 = c(
0.21,
0.301,
0.421,
-0.052,
0.169,
-0.032,
-0.042,-0.048,
-0.035,
0.006,
-0.004
),
Var = c("W", "W", "W", "W",
"W", "B", "B", "B", "B", "B", "B")
),
row.names = c(1L, 2L,
3L, 5L, 7L, 214L, 215L, 216L, 217L, 218L, 221L),
class = "data.frame"
)
and here is a subset of my test data
test_sub <- structure(
list(
corr = c(
0.636658204667785,
0.5637857758104,
0.540558984461647,
0.392647603023863,
0.561801911406989,
0.297187412065481,
0.278864501603015,
0.505277007007347,
0.403811785308709,
0.510158398354856,
0.459607853624603
),
rdist = c(
0.887270722679019,
0.843656768956754,
0.815806338767273,
0.732093571145576,
0.832944903081762,
0.485497073465096,
0.454461718498521,
0.69094669881886,
0.627667080657035,
0.705558894672344,
0.620838398507191
),
CCF2 = c(
0.802017782695131,
0.731763898271157,
0.689402284804853,
0.577932997250877,
0.715111899030751,
0.324826043263382,
0.298456267077388,
0.544808216945995,
0.458148923874818,
0.551160266327893,
0.461228649848996
),
Wcorr = c(
0.655,
0.536,
0.677,
0.556,
0.571,
0.29,
0.25,
0.484,
0.25,
0.515,
0.314
),
Wcorr2 = c(
0.779,
0.682,
0.734,
0.675,
0.736,
0.5,
0.529,
0.611,
0.555,
0.639,
0.572
),
Wcorr3 = c(
0.368,
0.154,
0.266,
0.103,
0.224,
-0.204,
-0.16,
-0.026,
-0.149,
0.032,
-0.097
),
Var = c("W", "W", "W", "W", "W", "B", "B", "B", "B", "B",
"B")
),
row.names = c(4L, 6L, 8L, 13L, 15L, 321L, 322L, 329L,
334L, 341L, 344L),
class = "data.frame"
)
When I use this line,
# NOTE(review): `test` and `fittedTL` are not defined in this snippet --
# presumably they correspond to `test_sub` and the fitted predictions
# (`predict_glmnet`) used further below; confirm against the full script.
confusionMatrix(reference=as.factor(test$Var),data=fittedTL,mode = "everything")
With this I fit a machine-learning model using the glmnet method (it gives the best accuracy in my case)
# 10-fold cross-validation repeated 5 times; keep the class probabilities
# and the hold-out predictions made with the final tuning parameters.
classCtrl <- trainControl(method = "repeatedcv", number=10,repeats=5,classProbs = TRUE,savePredictions = "final")
# Fix the RNG so the resampling (and hence the fit) is reproducible.
set.seed(355)
# Fit an elastic-net classifier (glmnet) of Var on all other columns.
glmnetTL <- train(Var~., train_sub, method= "glmnet", trControl=classCtrl)
glmnetTL
And finally I compute the confusion matrix on my test set:
# Predict the class of each test row with the tuned model.
predict_glmnet <- predict(glmnetTL,test_sub)
predict_glmnet
# Confusion matrix of predicted vs. observed classes on the test set.
CM_glmnet <- confusionMatrix(reference=as.factor(test_sub$Var),data=predict_glmnet,mode = "everything")
CM_glmnet
The output of the confusion matrix is a table like so
          Reference
Prediction B W
         B 4 0
         W 2 5
So here I have two predictions/classifications that are not good.
Is there any way I can trace back which rows of my test set these misclassifications correspond to?

Related

Find the 3 nearest neighbours (dist()?) and calculate mean in new column

This is a sample of the data
structure(list(Season = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("2018/2019",
"2019/2020"), class = "factor"), Date2 = structure(c(17860, 17888,
17916, 17940, 17945, 17952, 17953, 17954, 17978, 17999, 18005,
18188, 18209, 18223, 18237, 18320, 18322, 18334, 18447, 18476
), class = "Date"), HT.av.points = c(0.57, 1.5, 1.67, 1.8, 1.09,
2.18, 1.42, 1.45, 1.79, 1.35, 1.14, 1.83, 2, 1.17, 1.88, 1.83,
1.33, 0.92, 1.31, 1.06), AT.av.points = c(1.14, 2.33, 0.56, 1.2,
1.09, 1.6, 1.08, 1.9, 1.17, 0.9, 1.38, 0.67, 2.14, 1.33, 0.62,
1.08, 2.17, 1.38, 0.56, 0.94), HT_av.PointsTotal = c(0.86, 1.16,
1.18, 1.23, 0.86, 1.86, 1.2, 1.18, 1.5, 1.1, 1.07, 1.46, 1.6,
1.08, 1.75, 1.4, 1.16, 0.92, 1.03, 0.97), AT_av.PointsTotal = c(2.07,
2.21, 0.76, 1.42, 1.59, 1.5, 1.2, 1.91, 1.65, 1.43, 1.38, 0.54,
1.87, 1.58, 0.8, 1.6, 2.32, 1.42, 1.12, 1.32), DIFF.AV.POINTS.PREDICTION = c(-0.28,
-0.43, 0.51, 0.52, -0.36, 0.56, 0.28, -0.38, -0.2, 0.03, -0.43,
1.24, -0.32, -0.29, 1.44, 0.28, -0.85, -0.38, 1.01, 0.22), Over2.5G = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1)), row.names = c(NA,
-20L), class = c("data.table",
"data.frame"), .Names = c("Season", "Date2", "HT.av.points",
"AT.av.points", "HT_av.PointsTotal", "AT_av.PointsTotal", "DIFF.AV.POINTS.PREDICTION",
"Over2.5G"))
What I want to do:
group by Season
After the group by, I want to find the 3 previous rows that are most similar (according to the following columns) to the current row.
(HT.av.points, AT.av.points, HT_av.PointsTotal, AT_av.PointsTotal, DIFF.AV.POINTS.PREDICTION)
I guess the dist() function is a possibility.
Finally I want to create a new column with the mean of the values of the Over2.5G column of those 3 most similar rows.
New column:
First 3 rows(of the Season) NAs.
In fourth row(of the Season) the 3 nearest neighbours (and their Over2.5G values) will always be the first 3 rows.
Breaking the code below up:
a helper function which returns row indices of nearest neighbours with a ready-made function, e.g. get.knn of package FNN
calling this function for increasingly large slices (from row one to current) of the input data df and storing the result as an extra column
extracting the row indices as integers from the result string to index the desired column of the input data for the aggregation (mean, in your case)
here we go:
## Helper: returns the row indices of the 3 nearest neighbours of the
## *last* row of `df` (in the feature space spanned by df's columns),
## as a comma-separated string, e.g. "4,7,2".
##
## df:  numeric data frame of the feature columns, rows 1..current.
## ...: unused; kept for call-compatibility.
find_nearest_predecessors <- function(df, ...){
  if (nrow(df) < 4) {
    ## Fewer than 4 rows: there are not 3 distinct neighbours to find,
    ## so fall back to the first three row indices.
    ## (A plain if/else replaces the original scalar ifelse(), which is
    ## meant for vectorised tests and strips attributes.)
    paste(1:3, collapse = ',')
  } else {
    ## get.knn (package FNN) returns, per row, the indices of the k
    ## nearest neighbours; we only need those of the last (current) row.
    ## The argument is spelled out as `algorithm` -- the original `algo`
    ## relied on partial argument matching.
    get.knn(data = df,
            k = 3,
            algorithm = 'CR'
    ) %>%
      .[['nn.index']] %>%
      tail(1) %>% paste(collapse = ',')
  }
}
## df being your input data:
## For every row, look only at the data up to and including that row,
## find the 3 nearest earlier rows in feature space, and average their
## Over2.5G values into a new column.
df %>%
## number the rows so each rowwise step knows how far down it is:
mutate(rownum = row_number()) %>%
## rowwise: the following mutate runs once per row
rowwise %>%
mutate(nearest_neighbours = find_nearest_predecessors(
df = ## use previous data up to current row:
slice(df, 1:rownum) %>%
## choose features/dimensions of distance:
select(HT.av.points, AT.av.points, HT_av.PointsTotal,
AT_av.PointsTotal, DIFF.AV.POINTS.PREDICTION)
),
## calculate mean of OVER2.5G
## NOTE(review): the indices refer to rows of the *whole* df, and for
## the first 3 rows the helper returns "1,2,3", which can include the
## current row itself -- confirm that is the intended behaviour.
mean_Over2.5G = mean(df$Over2.5G[
## the helper returns indices as "i,j,k"; split back into integers:
strsplit(nearest_neighbours,',') %>%
unlist %>% as.integer
], na.rm = TRUE)
)

Coefficient of variances by column to each dataframe in a list of dataframes

I have written a function to calculate the coefficient of variation that I want to apply to a list of dataframes. The function executes but returns an unexpected outcome. Instead of returning the result for each column of each dataframe, it also produces additional values that do not make sense.
A sample of my list of dataframes below
list(Fe = structure(list(Determination_No = 1:6, `2` = c(NA,
NA, NA, NA, NA, NA), `3` = c(56.83, 56.54, 56.18, 56.5, 56.51,
56.34), `4` = c(56.39, 56.43, 56.53, 56.31, 56.47, 56.35), `5` = c(56.32,
56.29, 56.31, 56.32, 56.39, 56.32), `7` = c(56.48, 56.4, 56.54,
56.43, 56.73, 56.62), `8` = c(56.382, 56.258, 56.442, 56.258,
56.532, 56.264), `10` = c(56.3, 56.5, 56.2, 56.5, 56.7, 56.5),
`12` = c(56.11, 56.46, 56.1, 56.35, 56.36, 56.37)), row.names = c(NA,
-6L), class = "data.frame"), SiO2 = structure(list(Determination_No = 1:6,
`2` = c(7.63, 7.65, 7.73, 7.67, 7.67, 7.67), `3` = c(7.84,
7.69, 7.59, 7.77, 7.74, 7.64), `4` = c(7.67, 7.74, 7.62,
7.81, 7.66, 7.8), `5` = c(7.91, 7.84, 7.96, 7.87, 7.84, 7.92
), `7` = c(7.77, 7.83, 7.76, 7.78, 7.65, 7.74), `8` = c(7.936,
7.685, 7.863, 7.838, 7.828, 7.767), `10` = c(7.872684992,
7.851291827, 7.872684992, 7.722932832, 7.680146501, 7.615967003
), `12` = c(7.64, 7.71, 7.71, 7.65, 7.82, 7.68)), row.names = c(NA,
-6L), class = "data.frame"), Al2O3 = structure(list(Determination_No = 1:6,
`2` = c(2.01, 2.02, 2.03, 2.01, 2.02, 2), `3` = c(2.01, 2.01,
2, 2.02, 2.02, 2.03), `4` = c(2, 2.03, 1.99, 2.01, 2.01,
2.01), `5` = c(2.02, 2.02, 2.05, 2.03, 2.02, 2.03), `7` = c(NA,
NA, NA, NA, NA, NA), `8` = c(2.053, 2.044, 2.041, 2.038,
2.008, 2.02), `10` = c(2.002830415, 2.021725042, 2.021725042,
1.983935789, 2.002830415, 2.021725042), `12` = c(NA, NA,
NA, NA, NA, NA)), row.names = c(NA, -6L), class = "data.frame"),
TiO2 = structure(list(Determination_No = 1:6, `2` = c(0.07,
0.07, 0.07, 0.07, 0.07, 0.07), `3` = c(NA, NA, NA, NA, NA,
NA), `4` = c(0.07, 0.07, 0.07, 0.07, 0.07, 0.07), `5` = c(0.07,
0.07, 0.07, 0.07, 0.07, 0.07), `7` = c(NA, NA, NA, NA, NA,
NA), `8` = c(NA, NA, NA, NA, NA, NA), `10` = c(0.066721378,
0.066721378, 0.066721378, 0.066721378, 0.066721378, 0.066721378
), `12` = c(NA, NA, NA, NA, NA, NA)), row.names = c(NA, -6L
), class = "data.frame"), Mn = structure(list(Determination_No = 1:6,
`2` = c(0.194, 0.209, 0.218, 0.22, 0.213, 0.217), `3` = c(0.222,
0.214, 0.21, 0.212, 0.205, 0.213), `4` = c(0.21, 0.21,
0.21, 0.22, 0.23, 0.2), `5` = c(0.23, 0.21, 0.22, 0.21,
0.2, 0.22), `7` = c(0.197, 0.238, 0.205, 0.223, 0.205,
0.214), `8` = c(0.217, 0.221, 0.237, 0.213, 0.227, 0.232
), `10` = c(0.21, 0.21, 0.22, 0.23, 0.21, 0.22), `12` = c(NA,
0.24, 0.23, 0.23, 0.22, 0.23)), row.names = c(NA, -6L
), class = "data.frame"), CaO = structure(list(Determination_No = 1:6,
`2` = c(0.08, 0.07, 0.07, 0.07, 0.08, 0.07), `3` = c(0.08,
0.07, 0.07, 0.07, 0.07, 0.07), `4` = c(NA, NA, NA, NA,
NA, NA), `5` = c(0.08, 0.07, 0.08, 0.07, 0.07, 0.07),
`7` = c(NA, NA, NA, NA, NA, NA), `8` = c(0.07, 0.071,
0.07, 0.067, 0.071, 0.07), `10` = c(0.069959326, 0.069959326,
0.069959326, 0.069959326, 0.069959326, 0.069959326),
`12` = c(NA, NA, NA, NA, NA, NA)), row.names = c(NA,
-6L), class = "data.frame"))
function below
# Coefficient-of-variation summary for one data frame of lab results.
# This is the version that produces the unexpected output (see note on
# lab.cvall below).  Column 1 (Determination_No) is always dropped.
labCV <- function(x,...){
# per-lab (per-column) means
LabMean <- round(mapply(mean, x[-1], na.rm = T),digits = 2)
# grand mean taken as the median of the lab means
Lab.GrandMean <- median(LabMean,na.rm=T)
# per-lab standard deviations
lab.SD <- round(mapply(sd, x[-1], na.rm = T), digits = 2)
SD.All <- unlist(x[-1]) #convert all the values to a vector
# per-lab CV (%)
lab.cv <- as.vector(lab.SD/LabMean) *100
# BUG: SD.All holds every individual measurement, so this divides each
# single measurement by the grand mean, yielding one value per
# measurement (6 per lab) instead of one overall CV -- this is the
# source of the extra `21`, `22`, ... entries in the output.
lab.cvall <- ((SD.All / Lab.GrandMean) * 100)
lab.cv.T <- format(round(lab.cv,2),nsmall = 2)
lab.cvall.T <- format(round(lab.cvall,2),nsmall =2)
# summary therefore mixes per-lab CVs with per-measurement "CVs"
CV.Summary <- c("Coeff. Variation", lab.cv.T, lab.cvall.T)
return(CV.Summary)
}
df.cv <- lapply(df, function(x) labCV(x,na.rm=T))
I only expect a result for each lab in each dataframe however I am getting
c("Coeff. Variation", " NA", "0.39", "0.14", "0.05", "0.21",
"0.21", "0.32", "0.27", `21` = " NA", `22` = " NA", `23` = " NA",
`24` = " NA", `25` = " NA", `26` = " NA", `31` = "100.74",
`32` = "100.23", `33` = " 99.59", `34` = "100.16", `35` = "100.18",
`36` = " 99.88", `41` = " 99.96", `42` = "100.04", `43` = "100.21",
`44` = " 99.82", `45` = "100.11", `46` = " 99.89", `51` = " 99.84",
`52` = " 99.79", `53` = " 99.82", `54` = " 99.84", `55` = " 99.96",
`56` = " 99.84", `71` = "100.12", `72` = " 99.98", `73` = "100.23",
`74` = "100.04", `75` = "100.57", `76` = "100.37", `81` = " 99.95",
`82` = " 99.73", `83` = "100.06", `84` = " 99.73", `85` = "100.22",
`86` = " 99.74", `101` = " 99.80", `102` = "100.16", `103` = " 99.63",
`104` = "100.16", `105` = "100.51", `106` = "100.16", `121` = " 99.47",
`122` = "100.09", `123` = " 99.45", `124` = " 99.89", `125` = " 99.91",
`126` = " 99.93")
I didn't expect anything after 9 rows/entries. Not sure where I have gone wrong.
The following got the desired result
# Coefficient of variation (CV, in %) summary for one lab data frame.
#
# x:   data frame whose first column is Determination_No and whose
#      remaining columns each hold one lab's measurements.
# ...: ignored; kept for call-compatibility with lapply(..., na.rm = TRUE).
#
# Returns a named numeric vector: one CV per lab column, followed by a
# single unnamed overall CV (SD of all measurements divided by the mean
# of the per-lab means).
labCV <- function(x, ...) {
  measurements <- x[-1]  # drop the Determination_No column
  col_sds   <- vapply(measurements, sd,   numeric(1), na.rm = TRUE)
  col_means <- vapply(measurements, mean, numeric(1), na.rm = TRUE)
  per_lab_cv <- col_sds / col_means * 100
  overall_cv <- round(sd(unlist(measurements), na.rm = TRUE), digits = 4) /
    mean(col_means, na.rm = TRUE) * 100
  c(per_lab_cv, overall_cv)
}
Maybe you only need lab.cv.T in the output.
# Suggested fix (sketch): keep only the per-lab CVs (`lab.cv.T`) in the
# summary and drop `lab.cvall.T`, which has one entry per measurement.
# The elided lines (#...) are unchanged from the original function.
labCV <- function(x,...){
LabMean <- round(mapply(mean, x[-1], na.rm = T),digits = 2)
#... (intermediate steps as in the original function)
#...
CV.Summary <- c("Coeff. Variation", lab.cv.T)
return(CV.Summary)
}

Averaging the replicate data in omics / biostatistics

I have a dataframe for gene expression data. Samples are named as Genotype_Time_Replicate (e.g. AOX_1h_4).
E.g. data set
df <- structure(list(ID = c("AT5G54740.1", "AT5G55730.2", "AT5G57655.2", "AT5G64100.1", "AT5G64260.1", "AT5G67360.1", "AT1G30630.1", "AT1G62380.1", "AT1G70830.1", "AT3G14990.1", "AT4G18800.1", "AT4G24510.1", "AT5G15650.1", "AT5G19820.1", "AT5G59840.1", "AT5G47200.1", "AT1G12840.1", "AT1G76030.1", "AT1G78900.2", "AT3G42050.1", "AT4G11150.1", "AT1G11860.2", "AT1G17290.1" ),
Location = c("extracellular", "extracellular", "extracellular", "extracellular", "extracellular", "extracellular", "golgi", "golgi", "golgi", "golgi", "golgi", "golgi", "golgi", "golgi", "golgi", "ER", "ER", "ER", "mitochondrion", "mitochondrion", "mitochondrion", "mitochondrion", "mitochondrion"),
AOX_1h_1 = c(0.844651873, 0.50954096, 1.12e-08, 0.012981372, 0.978148381, 0.027579578, 0.068010151, 0.410629215, 0.253838635, 0.033631788, 0.335713512, 0.982799013, 0.025910457, 0.793810264, 0.762431665, 0.152154436, 0.027114103, 0.000227, 1.07e-05, 0.721209032, 0.086281162, 0.483130711, 0.014795515),
AOX_1h_2 = c(0.894623378, 0.011521413, 1.62e-06, 0.085249729, 0.02863972, 0.956962154, 0.225208718, 0.932679767, 0.002574192, 0.071700671, 0.233682544, 0.936572874, 1.12e-05, 0.241658735, 0.865205515, 0.000537, 0.103471292, 8.66e-07, 1.22e-08, 0.950878446, 0.145012176, 0.092919172, 0.599713247),
AOX_1h_3 = c(0.880951025, 0.00145276, 8.59e-10, 0.087023475, 0.675527672, 0.765543306, 0.305860948, 0.899172011, 0.020973476, 0.542988545, 0.735571562, 0.157569324, 0.025488075, 0.071006507, 0.262324019, 0.080470612, 0.0436526, 6.65e-09, 5.63e-10, 0.020557091, 0.069577215, 0.005502212, 0.852099232),
AOX_1h_4 = c(0.980823252, 0.158123518, 0.00210702, 0.006317657, 0.30496173, 0.489709702, 0.091469807, 0.958443361, 0.015583593, 0.566165972, 0.66746161, 0.935102341, 0.087733288, 0.744313619, 0.021169383, 0.633250945, 0.257489406, 0.024345088, 0.000355, 0.226279179, 0.004038493, 0.479275204, 0.703522761),
AOX_2h_1 = c(0.006474022, 0.246530998, 5.38e-06, 0.47169153, 0.305973663, 0.466202566, 0.191733645, 0.016121487, 0.234839116, 0.043866023, 0.089819656, 0.107934599, 2.09e-06, 0.413229678, 0.464078018, 0.004118766, 0.774970986, 3.79e-07, 2.3e-10, 0.428591262, 0.002326292, 0.385580707, 0.106216066),
AOX_2h_2 = c(0.166169729, 0.005721199, 7.77e-08, 0.099146712, 0.457164663, 0.481987525, 7.4e-05, 0.969805081, 0.100894997, 0.062103337, 0.095718425, 0.001686206, 0.009710516, 0.134651787, 0.887036569, 0.459218152, 0.074576369, 3.88e-09, 3.31e-15, 0.409645805, 0.064874307, 0.346371524, 0.449444779),
AOX_2h_3 = c(1.06e-05, 0.576589898, 4.03e-08, 0.787468189, 0.971119601, 0.432593753, 0.000274, 0.86932399, 0.08657663, 4.22e-06, 0.071190008, 0.697384316, 0.161623604, 0.422628778, 0.299545652, 0.767867006, 0.00295567, 0.078724176, 4.33e-09, 0.988576028, 0.080278831, 0.66505527, 0.014158693),
AOX_2h_4 = c(0.010356719, 0.026506539, 9.48e-09, 0.91009296, 0.302464488, 0.894377768, 0.742233323, 0.75032613, 0.175841127, 0.000721, 0.356904918, 0.461234653, 1.08e-05, 0.65800831, 0.360085919, 0.004814238, 0.174670947, 0.004246734, 7.31e-11, 0.778725214, 0.051334623, 0.10212841, 0.155831664 ),
AOX_6h_1 = c(0.271681878, 0.004822226, 1.87e-11, 0.616969208, 0.158860224, 0.684690326, 0.011798791, 0.564591916, 0.000314, 4.79e-06, 0.299871385, 0.001909713, 0.00682428, 0.039107415, 0.574143284, 0.061532691, 0.050483892, 2.28e-08, 1.92e-12, 0.058747794, 0.027147473, 0.196608218, 0.513693112),
AOX_6h_2 = c(5.72e-12, 0.719814288, 0.140016259, 0.927094438, 0.841229414, 0.224510089, 0.026567282, 0.242981965, 0.459311076, 0.038295888, 0.127935565, 0.453746728, 0.005023732, 0.554532387, 0.280899096, 0.336458018, 0.002024021, 0.793915731, 0.012838565, 0.873716549, 0.10097853, 0.237426815, 0.003711539),
AOX_6h_3 = c(3.16e-12, 0.780424491, 0.031315419, 0.363891436, 0.09562579, 0.104833988, 3.52e-05, 0.104196756, 0.870952423, 0.002036134, 0.016480622, 0.671475063, 2.3e-05, 0.00256744, 0.66263641, 0.005026601, 0.57280276, 0.058724117, 6.4e-10, 0.030965264, 0.005301006, 0.622027012, 0.371659724),
AOX_6h_4 = c(7.99e-10, 0.290847169, 0.001319424, 0.347344795, 0.743846306, 0.470908425, 0.00033, 0.016149973, 0.080036584, 0.020899676, 0.00723071, 0.187288769, 0.042514886, 0.00150443, 0.059344154, 0.06554177, 0.112601764, 0.000379, 2.36e-10, 0.78131093, 0.105861995, 0.174370801, 0.05570041 ),
WT_1h_1 = c(0.857, 0.809, 2.31e-05, 0.286, 0.87, 0.396, 0.539, 0.787, 0.73, 0.427, 0.764, 0.87, 0.386, 0.852, 0.848, 0.661, 0.393, 0.0415, 0.00611, 0.843, 0.576, 0.804, 0.304 ),
WT_1h_2 = c(0.898, 0.509, 0.0192, 0.729, 0.616, 0.902, 0.811, 0.9, 0.343, 0.712, 0.814, 0.901, 0.0446, 0.816, 0.896, 0.217, 0.747, 0.0143, 0.000964, 0.901, 0.776, 0.737, 0.876 ),
WT_1h_3 = c(0.939, 0.627, 0.0104, 0.867, 0.932, 0.935, 0.91, 0.939, 0.803, 0.926, 0.934, 0.888, 0.813, 0.859, 0.905, 0.864, 0.838, 0.0223, 0.00917, 0.802, 0.858, 0.724, 0.938 ),
WT_1h_4 = c(0.911, 0.782, 0.298, 0.396, 0.837, 0.871, 0.727, 0.91, 0.506, 0.88, 0.89, 0.909, 0.723, 0.896, 0.547, 0.887, 0.824, 0.566, 0.175, 0.814, 0.348, 0.869, 0.893),
WT_2h_1 = c(0.748, 0.911, 0.231, 0.929, 0.917, 0.928, 0.903, 0.801, 0.909, 0.849, 0.878, 0.884, 0.183, 0.925, 0.928, 0.719, 0.941, 0.108, 0.00817, 0.926, 0.678, 0.923, 0.884),
WT_2h_2 = c(0.935, 0.851, 0.163, 0.925, 0.951, 0.952, 0.63, 0.963, 0.926, 0.916, 0.925, 0.804, 0.868, 0.931, 0.961, 0.951, 0.92, 0.0706, 0.000265, 0.95, 0.917, 0.947, 0.951),
WT_2h_3 = c(0.0197, 0.894, 0.000613, 0.911, 0.922, 0.877, 0.122, 0.916, 0.739, 0.0125, 0.718, 0.905, 0.801, 0.875, 0.852, 0.91, 0.302, 0.729, 0.00015, 0.923, 0.731, 0.902, 0.504),
WT_2h_4 = c(0.696, 0.765, 0.0142, 0.931, 0.893, 0.931, 0.925, 0.925, 0.87, 0.45, 0.899, 0.908, 0.144, 0.921, 0.899, 0.631, 0.87, 0.62, 0.0014, 0.926, 0.807, 0.844, 0.865),
WT_6h_1 = c(0.898, 0.727, 0.00395, 0.921, 0.881, 0.924, 0.776, 0.919, 0.542, 0.234, 0.901, 0.67, 0.747, 0.83, 0.919, 0.848, 0.841, 0.056, 0.00144, 0.846, 0.815, 0.888, 0.916),
WT_6h_2 = c(2.38e-09, 0.88, 0.708, 0.898, 0.891, 0.768, 0.443, 0.777, 0.843, 0.505, 0.695, 0.842, 0.208, 0.859, 0.794, 0.813, 0.14, 0.887, 0.326, 0.894, 0.661, 0.775, 0.182),
WT_6h_3 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
WT_6h_4 = c(0.0357, 0.953, 0.792, 0.956, 0.967, 0.96, 0.711, 0.892, 0.931, 0.899, 0.866, 0.946, 0.917, 0.799, 0.925, 0.927, 0.938, 0.72, 0.025, 0.967, 0.936, 0.945, 0.923)),
class = "data.frame", row.names = c(NA, -23L))
I want to summarize data for each organelle (averaged by organelle and samples' replicates) and plot the Wildtype and mutant data side by side with standard error for each time point
# FIX: the original snippet had a dangling `df <-` on its first line,
# which made the first two lines parse as `df <- melted <- melt(df)`
# and silently overwrote `df` with its own molten copy.  The stray
# assignment is removed here.
melted <- melt(df)
head(melted)
# strip the trailing replicate number ("_1".."_4") so replicates share a label
melted$variable<- str_replace_all(melted$variable, '_[0-9]$', '')
# order the levels so WT sits next to AOX within each time point
melted$variable <- factor(melted$variable,levels=c("WT_1h","AOX_1h","WT_2h","AOX_2h","WT_6h","AOX_6h"))
# genotype pairs to test against each other at each time point
my_comparisons <- list( c("WT_1h","AOX_1h"), c("WT_2h","AOX_2h"),c("WT_6h","AOX_6h"))
# barplot of mean +/- SE per group, faceted by subcellular location,
# with significance stars for each WT-vs-AOX comparison
ggbarplot(melted, x = "variable", y = "value", add = "mean_se",
color = "variable", palette = c("grey","black","grey","black","grey","black"),
facet.by = "Location")+
stat_compare_means(comparisons = my_comparisons, label = "p.signif")
How can I use tidyverse (dplyr / tidyr) for this purpose?
How can I use tidyverse (dplyr / tidyr) to follow this pathway instead of above scripts?
You can use different functions to normalise this data. I use gather() in this example alongside stringr functions to extract the data from the character vector that has 3 columns of data in it.
# Reshape wide -> long, split the sample name "Genotype_Time_Replicate"
# into its three parts, then average over replicates (and genes).
dat %>%
gather(key, value, -ID, -Location) %>%
# key is e.g. "AOX_1h_4": element 1 = genotype, 2 = time, 3 = replicate
mutate(type = map_chr(str_split(key,"_"),~.x[1]),
hour = map_chr(str_split(key,"_"),~.x[2]),
n = map_chr(str_split(key,"_"),~.x[3])) %>%
# one mean per genotype + time combination
group_by(type, hour) %>%
summarise(mean = mean(value))
Gives
# A tibble: 6 x 3
# Groups: type [?]
type hour mean
<chr> <chr> <dbl>
1 AOX 1h 0.3235302
2 AOX 2h 0.2709910
3 AOX 6h 0.2226648
4 WT 1h 0.6633866
5 WT 2h 0.7263108
6 WT 6h 0.7915662
This you can use in ggplot() to make a nice barplot.
To get it in a table you can use
# Same reshaping as above; spread() then pivots the genotype back into
# columns, giving one row per time point with an AOX and a WT column.
dat %>%
gather(key, value, -ID, -Location) %>%
mutate(type = map_chr(str_split(key,"_"),~.x[1]),
hour = map_chr(str_split(key,"_"),~.x[2]),
n = map_chr(str_split(key,"_"),~.x[3])) %>%
group_by(type, hour) %>%
summarise(mean = mean(value)) %>%
# wide format: one column per genotype
spread(type, mean)
to get
# A tibble: 3 x 3
hour AOX WT
* <chr> <dbl> <dbl>
1 1h 0.3235302 0.6633866
2 2h 0.2709910 0.7263108
3 6h 0.2226648 0.7915662
Another version going from the df object:
The df object is a list, and expression values after cbind are character type, so you can do
# Bind the list elements into one tibble; after cbind the expression
# values arrive as character, so columns 3:14 are converted back to
# numeric.  NOTE(review): mutate_at() is superseded by across() in
# current dplyr -- kept as-is here.
tb <- as_tibble(do.call(cbind, df)) %>%
mutate_at(3:14, as.numeric)
NB that usually for gene expression data it is easier to read in count data using read_tsv or read.table and combine into matrix, data.frame or tibble.
NBB the df object specified has no "WT" samples (from my copy/paste anyway) so I renamed last 4 samples in tb as "WT_1h" replicates
# relabel the last four sample columns as WT_1h replicates 1..4
colnames(tb)[11:14] <- paste0("WT_1h_",c(1:4))
Create means from replicates by function
# Add a "<nm>_mean" column to tb: the row-wise mean of every column
# whose name matches `nm` (e.g. nm = "AOX_1h" averages AOX_1h_1..AOX_1h_4).
rowMeanNrep <- function(tb, nm){
varname <- paste0(nm, "_mean")
# NOTE(review): grep() matches `nm` anywhere in the column name, so a
# previously added "<nm>_mean" column would also match if the function
# were applied twice -- consider anchoring the pattern.
selectn <- grep(nm, colnames(tb))
tb %>%
# !!varname := ... builds the new column name dynamically (tidy eval)
dplyr::mutate(!!varname := rowMeans(dplyr::select(., !!selectn)))
}
Specify which timepoints to use, and apply
# Timepoints for which replicate means are wanted.
tps <- c("AOX_1h", "WT_1h")
# For each timepoint compute its replicate-mean column, then bind the
# mean columns next to the ID/Location annotation columns.
# FIX: the original used the undefined object `tb_1h` for the first two
# columns; the annotation columns live in `tb`, so `tb` is used here.
tb_1h_mean <- cbind(tb[,1:2],
do.call(cbind, lapply(tps, function(f){
rowMeanNrep(tb=tb, nm=f) %>%
dplyr::select(paste0(f, "_mean"))
}))
)
A final NB, think about using boxplots instead of barplots, see this paper

how do you draw 3d charts in R with different colors

I have this data frame called t:
dput(t)
structure(list(timestamp = structure(c(1466306383, 1466306445,
1466306507, 1466306569, 1466306631, 1466306693, 1466306755, 1466306817,
1466306879, 1466306943, 1466307006, 1466307068, 1466307130, 1466307193,
1466307255, 1466307317, 1466307379, 1466307442, 1466307504, 1466307566
), class = c("POSIXct", "POSIXt"), tzone = ""), cpuused = c(1.13007,
1.13007, 1.13002, 1.12996, 1.1299, 1.12985, 1.12979, 1.12976,
1.1297, 1.12965, 1.12959, 1.12953, 1.12947, 1.12942, 1.12936,
1.1293, 1.12927, 1.12921, 1.12915, 1.1291), transratepersec = c(2640.77,
2640.61, 2640.44, 2640.28, 2640.12, 2639.95, 2639.79, 2639.69,
2639.53, 2639.36, 2639.19, 2639.03, 2638.86, 2638.7, 2638.54,
2638.37, 2638.21, 2638.04, 2637.88, 2637.72), reqpersec = c(0.172818,
0.172806, 0.172793, 0.172779, 0.172766, 0.172752, 0.172739, 0.172727,
0.172714, 0.1727, 0.172687, 0.172673, 0.17266, 0.172646, 0.172633,
0.17262, 0.172608, 0.172594, 0.172581, 0.172567), resptime = c(0.274,
0.235, 0.234, 0.234, 0.236, 0.234, 0.235, 0.236, 0.236, 0.233,
0.267, 0.235, 0.243, 0.235, 0.232, 0.233, 0.31, 0.233, 0.26,
0.234)), .Names = c("timestamp", "cpuused", "transratepersec",
"reqpersec", "resptime"), row.names = c(11653L, 19385L, 2624L,
16106L, 13990L, 12724L, 9490L, 12720L, 19387L, 11656L, 13988L,
9488L, 11650L, 5639L, 16104L, 15090L, 17156L, 4856L, 12722L,
382L), class = "data.frame")
I would like to create a 3 dimensional chart with x=reqpersec, y=cpuused, z=transratepersec, having different colors of x,y and x,z.
I have tried this:
library(rgl)
# Reference the data frame columns explicitly rather than attach()ing:
# attach() puts a copy of the columns on the search path, is easy to
# forget to detach(), and can silently mask other objects (note that
# the name `t` itself already masks base::t).
plot3d(x=t$reqpersec, y=t$cpuused, z=t$transratepersec, type="p", col="red", xlab="ReqPerSec", ylab="CPU", zlab="TransRate", size=5, lwd=15, box=F)
It is only giving me one color. Any ideas how to easily do this?
I would like images to be similar to this:

Remove margin inside plot of ggplot2

this is my script and the associated plot:
library(ggplot2)
library(reshape)
df <- structure(list(ID = structure(1:19, .Label = c("2818/22/0834",
"2818/22/0851", "2818/22/0853", "2818/22/0886", "B0F", "B12T",
"B1T", "B21T", "B22F", "B26T", "B33F", "B4F", "P1", "P21", "P24",
"P25", "P27", "P28", "P29"), class = "factor"), K = c(0.089,
0.094, 0.096, 0.274, 0.09, 0.312, 0.33, 0.178, 0.05, 0.154, 0.083,
0.098, 0.035, 0.084, 0.053, 0.061, 0.043, 0.094, 0.101), Na = c(2.606,
3.822, 4.977, 2.522, 15.835, 83.108, 52.041, 41.448, 11.849,
40.531, 5.854, 10.151, 3.52, 8.445, 5.273, 7.246, 6.177, 14.813,
15.569), Cl = c(3.546, 6.181, 8.422, 3.733, 14.685, 96.911, 65.518,
79.01, 10.349, 53.361, 6.12, 10.832, 2.313, 10.312, 5.641, 8.708,
6.138, 12.302, 20.078), Mg = c(1.487, 1.773, 1.992, 1.143, 2.991,
1.678, 2.23, 3.288, 1.148, 2.428, 3.428, 2.729, 0.777, 2.554,
2.374, 4.075, 1.993, 1.881, 3.034), Ca = c(5.529, 6.205, 6.59,
4.099, 10.631, 4.564, 6.652, 13.374, 4.332, 10.542, 11.194, 10.053,
2.969, 7.73, 8.163, 11.539, 6.166, 5.968, 9.299), SO4 = c(0.663,
0.831, 0.607, 0.882, 9.013, 0.896, 0.652, 0.021, 1.446, 0.012,
8.832, 6.665, 1.003, 2.575, 3.685, 7.121, 3.64, 5.648, 2.397),
HCO3 = c(7.522, 5.498, 6.15, 5.242, 8.582, 4.067, 5.65, 9.364,
5.435, 8.068, 9.054, 8.326, 4.805, 7.235, 7.488, 9.234, 6.352,
6.98, 8.34)), .Names = c("ID", "K", "Na", "Cl", "Mg", "Ca",
"SO4", "HCO3"), class = "data.frame", row.names = c(NA, -19L))
# long format: one row per (sample ID, ion) pair
df_melted<-melt(df, na.rm=T)
# one line per sample across the ions, points on a log10 y axis;
# legend suppressed because 19 IDs would overwhelm it
ggplot(df_melted, aes(variable, value, group=ID, color=ID))+
geom_point(size=2) +
geom_line() +
theme(legend.position="none") +
scale_y_log10(breaks=seq(0, 100, 10))
Is there a way to remove the spaces at the beginning and at the end of the plot? I tried with xlim but the problem is that the x variable is not a numerical variable, so, something like xlim(c("K", "HCO3")) doesn't work.
This is a discrete scale, but you can still used the expand argument as follows. Whether the output looks acceptable or not is another matter. Play with the c(0,0) values until you find something that suits. Using 0.1 for the second value gives a slightly better plot, in my view...
# Same plot as the question's, with the padding at both ends of the
# discrete x axis removed via the `expand` argument of
# scale_x_discrete().  Scale and theme layers are grouped together;
# their addition order does not affect the rendered result.
ggplot(df_melted, aes(x = variable, y = value, group = ID, color = ID)) +
  geom_point(size = 2) +
  geom_line() +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_log10(breaks = seq(0, 100, 10)) +
  theme(legend.position = "none")

Resources