Related
I want to figure out how to find the variable cutoff points for a glm. cutpointr does this really well for a single variable, but with more complex models how would I go about extracting this info? Say I fit ~ glucose + age + mass — could I say that at the optimal cutoff the values x, y and z of those variables are present, or is this not possible? Or would it be more like a single number that is the combined cutoff of the variables? Also, sorry for no images — it wouldn't let me add them in!
#for reprex
library(reprex)
library(mlbench)
library(tidymodels)
library(cutpointr)
#>
#> Attaching package: 'cutpointr'
#> The following objects are masked from 'package:yardstick':
#>
#> accuracy, npv, ppv, precision, recall, sensitivity, specificity
library(ggplot2)
library(tidyverse)
# Load the Pima Indians diabetes example data shipped with mlbench
data(PimaIndiansDiabetes2)
head(PimaIndiansDiabetes2)
#> pregnant glucose pressure triceps insulin mass pedigree age diabetes
#> 1 6 148 72 35 NA 33.6 0.627 50 pos
#> 2 1 85 66 29 NA 26.6 0.351 31 neg
#> 3 8 183 64 NA NA 23.3 0.672 32 pos
#> 4 1 89 66 23 94 28.1 0.167 21 neg
#> 5 0 137 40 35 168 43.1 2.288 33 pos
#> 6 5 116 74 NA NA 25.6 0.201 30 neg
# Keep complete cases only (392 rows, per the cutpointr summary below)
Diabetes <- na.omit(PimaIndiansDiabetes2)
# NOTE(review): this glimpses the original 768-row data, not the
# complete-case `Diabetes` created on the line above — confirm intent
glimpse(PimaIndiansDiabetes2)
#> Rows: 768
#> Columns: 9
#> $ pregnant <dbl> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, 5, 7, 0, 7, 1, 1…
#> $ glucose <dbl> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125, 110, 168, 139,…
#> $ pressure <dbl> 72, 66, 64, 66, 40, 74, 50, NA, 70, 96, 92, 74, 80, 60, 72, N…
#> $ triceps <dbl> 35, 29, NA, 23, 35, NA, 32, NA, 45, NA, NA, NA, NA, 23, 19, N…
#> $ insulin <dbl> NA, NA, NA, 94, 168, NA, 88, NA, 543, NA, NA, NA, NA, 846, 17…
#> $ mass <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.3, 30.5, NA, 37.…
#> $ pedigree <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.134, 0.158…
#> $ age <dbl> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 34, 57, 59, 51, 3…
#> $ diabetes <fct> pos, neg, pos, neg, pos, neg, pos, neg, pos, pos, neg, pos, n…
# Single-predictor optimal cutoff: maximise sensitivity + specificity
# (Youden-style metric) for glucose predicting diabetes
cp <- cutpointr(Diabetes, glucose, diabetes,
method = maximize_metric, metric = sum_sens_spec)
#> Assuming the positive class is pos
#> Assuming the positive class has higher x values
# Built-in diagnostic plots for the fitted cutpoint object
plot_cutpointr(cp)
# Printed summary (below): optimal_cutpoint = 128, AUC = 0.8058
summary(cp)
#> Method: maximize_metric
#> Predictor: glucose
#> Outcome: diabetes
#> Direction: >=
#>
#> AUC n n_pos n_neg
#> 0.8058 392 130 262
#>
#> optimal_cutpoint sum_sens_spec acc sensitivity specificity tp fn fp tn
#> 128 1.5055 0.7628 0.7231 0.7824 94 36 57 205
#>
#> Predictor summary:
#> Data Min. 5% 1st Qu. Median Mean 3rd Qu. 95% Max. SD NAs
#> Overall 56 81.0 99.00 119.0 122.6276 143.00 181.00 198 30.86078 0
#> neg 56 79.0 94.00 107.5 111.4313 126.00 154.00 197 24.64213 0
#> pos 78 95.9 124.25 144.5 145.1923 171.75 188.55 198 29.83939 0
# Unnest the ROC curve stored in the cutpointr result (one row per candidate
# cutpoint, with tpr/tnr columns)
res_unnested <- cp %>%
unnest(cols = roc_curve)
# Annotation text showing the AUC and the chosen cutpoint
annotation <- paste0("AUC: ", round(cp$AUC, 2), "\n",
"Cutpoint: ", round(cp$optimal_cutpoint, 2))
# Manual ROC plot: sensitivity (tpr) vs 1 - specificity (1 - tnr), with
# dotted guides at the optimal cutpoint and the chance diagonal for reference
ggplot(res_unnested, aes(x = 1 - tnr, y = tpr)) +
xlab("1 - Specificity") +
ylab("Sensitivity") +
theme_bw() +
theme(aspect.ratio = 1) +
geom_line(color = "red") +
geom_vline(xintercept = 1 - cp$specificity, linetype = "dotted") +
geom_hline(yintercept = cp$sensitivity, linetype = "dotted") +
annotate("text", x = 0.85, y = 0.05, label = annotation) +
ggtitle("ROC curve", "Using glucose mg/dL as a predictive logistic variable for diabetes") +
geom_abline(intercept = 0, slope = 1, linetype = 2)
# Epi::ROC: multivariable logistic model diabetes ~ glucose + age + mass with
# an ROC plot of the fitted probabilities. NOTE(review): MX = T presumably
# marks the optimal point on the curve — confirm against ?Epi::ROC; also
# prefer spelling out TRUE rather than T
ROC(form = diabetes ~ glucose + age + mass, data=Diabetes, plot = "ROC", MX = T)
I have tried to add more parameters to cutpointr, which was unsuccessful. I have also tried running it with Epi and saw a better AUC with age and mass included. I have also run a glm, but I am just not sure how to properly analyze the glm for this type of information. I am looking on the tidymodels website for help while waiting for suggestions — thanks!
I have a data.frame with many rows. I am trying to produce a new data.frame summarizing the total row count for all combinations of V_ID and N_ID.
In the below, df1 is an example of my data and df2 is an example of the desired output.
# Example input: repeated (V_ID, N_ID) observations, each carrying a Length
df1 <- data.frame (V_ID = c(1234, 5252, 1234, 1234, 1234, 5252, 5252, 6754),
N_ID = c(45, 23, 45, 45, 45, 22, 23, 11),
Length = c(88, 33, 88, 88, 88, 33, 33, 22)
)
# Desired output: one row per (V_ID, N_ID) combination plus its row count
df2 <- data.frame (V_ID = c(1234, 5252, 5252, 6754),
N_ID = c(45, 23, 22, 11),
Num_Times=c(4, 2, 1, 1),
Length = c(88, 33, 33, 22)
)
Is there a way to do this in dplyr?
You can use count() from the dplyr package.
# count() groups by the listed columns and returns one row per distinct
# combination, with the number of occurrences in the `Num_Times` column.
library(dplyr)
count(df1, V_ID, N_ID, Length, name = "Num_Times")
#> V_ID N_ID Length Num_Times
#> 1 1234 45 88 4
#> 2 5252 22 33 1
#> 3 5252 23 33 2
#> 4 6754 11 22 1
I have two data.frames df1 with raw data. df2 has information on where to look in df1.
df1 has groups, defined by "id". In those groups, a subset is defined by df2$value_a1 and df2$value_a2, which represent the range of value_a to look at within the group. In that subset group I want to find the maximum value_b and select its row.
code for df1 and df2
# Raw data: 27 rows spread over three id groups (1, 2, 3)
df1 <- data.frame("id" = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), "value_a" = c(0, 10, 21, 30, 43, 53, 69, 81, 93, 5, 16, 27, 33, 45, 61, 75, 90, 2, 11, 16, 24, 31, 40, 47, 60, 75, 88), "value_b" = c(100, 101, 100, 95, 90, 104, 88, 84, 75, 110, 105, 106, 104, 95, 109, 96, 89, 104, 104, 104, 103, 106, 103, 101, 99, 98, 97), "value_c" = c(0, -1, -2, -2, -2, -2, -1, -1, 0, 0, 0, 0, 1, 1, 2, 2, 1, -1, 0, 0, 1, 1, 2, 2, 1, 1, 0), "value_d" = c(1:27))
# Per-id search window: only rows with value_a in [value_a1, value_a2] count
df2 <- data.frame("id" = c(1, 2, 3), "value_a1" = c(21, 33, 16), "value_a2" = c(69, 75, 60))
This is df1
id value_a value_b value_c value_d
1 1 0 100 0 1
2 1 10 101 -1 2
3 1 21 100 -2 3
4 1 30 95 -2 4
5 1 43 90 -2 5
6 1 53 104 -2 6
7 1 69 88 -1 7
8 1 81 84 -1 8
9 1 93 75 0 9
10 2 5 110 0 10
11 2 16 105 0 11
12 2 27 106 0 12
13 2 33 104 1 13
14 2 45 95 1 14
15 2 61 109 2 15
16 2 75 96 2 16
17 2 90 89 1 17
18 3 2 104 -1 18
19 3 11 104 0 19
20 3 16 104 0 20
21 3 24 103 1 21
22 3 31 106 1 22
23 3 40 103 2 23
24 3 47 101 2 24
25 3 60 99 1 25
26 3 75 98 1 26
27 3 88 97 0 27
This is df2
id value_a1 value_a2
1 1 21 69
2 2 33 75
3 3 16 60
My result would be df3, which would look like this
id value_a value_c
1 1 53 -2
2 2 61 2
3 3 31 1
I wrote this code to show my line of thinking.
# NOTE(review): this is the question's non-working attempt, kept as-is.
# Inside the grouped filter, df2$value_a1 / df2$value_a2 are whole length-3
# vectors, so they appear to be recycled against each group's rows rather
# than matched to that group's id — presumably the cause of the wrong output;
# joining df2 onto df1 by id first (as the answers do) avoids this. Also,
# pull(value_b) runs after filtering on max(value_a), not max(value_b).
df3 <- df1 %>%
group_by(id) %>%
filter(value_a >= df2$value_a1 & value_a <= df2$value_a2) %>%
filter(value_a == max(value_a)) %>%
pull(value_b)
This, however, generates a vector with three entries:
[1] 88 95 99
These are not the maximum value_b's...
Perhaps by() would work, but this gets stuck on using a function on two different df's.
It feels like I'm almost there, but still far away...
You can try this. I hope this helps.
# Join the per-id bounds onto every row of df1, blank out value_b for rows
# outside the window, then take the per-id maximum.
# Fixes vs. the original: explicit join key (no "Joining, by" guess),
# inclusive >=/<= bounds as the question specifies ("value_a >= df2$value_a1
# & value_a <= df2$value_a2"), and TRUE instead of the reassignable T.
# Results on this sample data are unchanged (104, 109, 106).
df1 %>%
  left_join(df2, by = "id") %>%
  mutate(val = ifelse(value_a >= value_a1 & value_a <= value_a2, value_b, NA)) %>%
  group_by(id) %>%
  summarise(val = max(val, na.rm = TRUE))
# A tibble: 3 x 2
id val
<dbl> <dbl>
1 1 104
2 2 109
3 3 106
Why don't you try a merge?
Then with data.table syntax:
library(data.table)
# Merge the per-id bounds onto df1 (all.x keeps every df1 row), then keep
# rows inside the inclusive [value_a1, value_a2] window and take the per-id
# maximum of value_b. Inclusive >=/<= matches the question's stated range;
# the strict >/< of the original could miss a maximum sitting exactly on a
# bound. (Assumes df1 is a data.table so DT[i, j, by] works — confirm, or
# call setDT(df3) after the merge.)
df3 <- merge(df1, df2, by = "id", all.x = TRUE)
max_values <- df3[value_a >= value_a1 & value_a <= value_a2, max(value_b), by = "id"]
max_values
# id V1
# 1: 1 104
# 2: 2 109
# 3: 3 106
I would do this using the data.table package, since it is just what I'm used to.
library(data.table)
# Same raw data as df1 in the question
dt.1 <- data.table("id" = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), "value_a" = c(0, 10, 21, 30, 43, 53, 69, 81, 93, 5, 16, 27, 33, 45, 61, 75, 90, 2, 11, 16, 24, 31, 40, 47, 60, 75, 88), "value_b" = c(100, 101, 100, 95, 90, 104, 88, 84, 75, 110, 105, 106, 104, 95, 109, 96, 89, 104, 104, 104, 103, 106, 103, 101, 99, 98, 97), "value_c" = c(0, -1, -2, -2, -2, -2, -1, -1, 0, 0, 0, 0, 1, 1, 2, 2, 1, -1, 0, 0, 1, 1, 2, 2, 1, 1, 0), "value_d" = c(1:27))
# Same per-id ranges as df2
dt.2 <- data.table("id" = c(1, 2, 3), "value_a1" = c(21, 33, 16), "value_a2" = c(69, 75, 60))
# Max value_b per id, restricted to ids present in dt.2.
# NOTE(review): the value_a1/value_a2 window from dt.2 is never applied here;
# it reproduces the expected numbers only because each id's overall maximum
# happens to fall inside its window — confirm whether the range filter matters
dt.3 <- dt.1[id %in% dt.2[,id],max(value_b), by="id"]
setnames(dt.3, "V1", "max_value_b")
dt.3
To get corresponding line where b is the max values there are several ways, here's one where I only modified a line from the previous code
# For each id present in dt.2, .SD is that id's sub-table; which.max(value_b)
# picks the single row holding the largest value_b, and .() (alias for list)
# selects which columns to return. NOTE(review): the dt.2 range columns are
# still not used to restrict the rows considered.
dt.1[id %in% dt.2[,id],.SD[which.max(value_b), .(value_a, value_b, value_c, value_d)], by="id"]
.SD means the sub-table you already selected with by so for each id selects the local max b and then returns a table which.max() selects the row, and finally .() is an alias for list, so lists the columns you wish from that table.
Perhaps a more readable approach is to first select the desired rows
# which.max(value_b) computed per group returns a row position *within* that
# id's group, but dt.1[max.b.rows, ] indexes the whole table — so the
# original selected the wrong rows for ids 2 and 3 (it yielded positions
# 6, 6, 5, all pointing into id 1's rows). Wrapping it in .I maps the
# within-group position to the global row number of dt.1, which is what the
# outer subset needs.
max.b.rows <- dt.1[id %in% dt.2[,id], .I[which.max(value_b)], by="id"][,V1]
dt.3 <- dt.1[max.b.rows,]
BTW, the id %in% dt.2[,id] part is just there to make sure you only select maxima for those ids in table 2
Best
I have a table I have read using the function read_csv. As an example, I have created a small data.frame to illustrate the structure.
In reality, the table is the count of 48160 bacteria (ASV_1 to ASV_48160 - rows) found in different samples (168 columns). The samples go from sample 1 - S1(a, b, c) - to sample 56 - S56(a, b, c) (a, b, and c are sub-samples).
My objective is to sum the a, b, c in every sample and get just one value. So, instead of me having S1a, S1b, S1c, I want to sum rows and have S1, keeping the same number of rows, and do the same with the 56 samples. Eventually, I want to keep just columns S1, S2, S3...S56 and erase the columns with a, b, c.
# Toy version of the real table: 10 ASVs (rows) x 3 samples, each sample
# split into sub-sample columns a, b, c that should be summed into one column
Bacteria <- data.frame(
ID=c("ASV_1", "ASV_2", "ASV_3", "ASV_4", "ASV_5", "ASV_6", "ASV_7", "ASV_8", "ASV_9", "ASV_10"),
s1a=c(135, 249, 142, 130, 20, 0, 0, 10, 85, 32),
s1b=c(14, 85, 0, 12, 0, 59, 0, 0, 96, 43),
s1c= c(0, 2, 8, 56, 145, 25, 65, 32, 25, 0),
s2a= c(54, 88, 65, 14, 0, 0, 4, 165, 87, 0),
s2b= c(33, 0, 74, 19, 22, 0, 5, 0, 3, 10),
s2c= c(178, 0, 1, 1, 0, 1, 66, 3, 0, 221),
s3a=c(135, 249, 12, 130, 20, 110, 0, 10, 55, 32),
s3b= c(99, 88, 65, 58, 0, 0, 4, 5, 87, 0),
s3c=c(90, 54, 6, 8, 0, 11, 91, 1,9, 0))
str(Bacteria)
I have tried some options with mutate from tidyverse, but that works for one sample (or at least I don't know how to repeat the operation for the 56 samples), and I don't want to do it manually from 1 to 56
# Manual approach: works for one sample but would need 56 such lines by hand
Bacteria <- Bacteria %>%
mutate(s1=s1a+s1b+s1c)
I have also tried rowSums, but again I don't know how to repeat this line of code for the other samples. (I am a beginner in R).
Please, Does anyone know a better option? I appreciate the help!
One approach would be to pivot the data to long form, remove the subsample identifier and then pivot back to wide using the values_fn argument to sum the common values.
library(dplyr)
library(tidyr)

# Pivot all count columns to long form, strip the trailing sub-sample letter
# (a/b/c) from each column name, then pivot back to wide; values_fn = sum
# collapses the rows that now share a name (s1a/s1b/s1c -> s1) by summing.
# values_fn takes the function directly — the list(value = sum) form of the
# original is the verbose legacy spelling.
Bacteria %>%
  pivot_longer(-ID) %>%
  mutate(name = sub("[a-c]$", "", name)) %>%
  pivot_wider(values_from = value, names_from = name, values_fn = sum)
# A tibble: 10 x 4
ID s1 s2 s3
<chr> <dbl> <dbl> <dbl>
1 ASV_1 149 265 324
2 ASV_2 336 88 391
3 ASV_3 150 140 83
4 ASV_4 198 34 196
5 ASV_5 165 22 20
6 ASV_6 84 1 121
7 ASV_7 65 75 95
8 ASV_8 42 168 16
9 ASV_9 206 90 151
10 ASV_10 75 231 32
I would like to modify the answer to the question here or have a new solution to include another column which shows the second largest consecutive run of "0". My sample data and code is below, the function is operating on the month columns and the second largest run column is what I hope to add. I am working with a large dataset so the more efficient the better, any ideas are appreciated, thanks.
sample data
# dput() of the sample data — assign it before running the code below,
# e.g. test <- structure(...). Jan:Dec hold the monthly values; the
# `Second largest run` and `result` columns are the expected outputs.
structure(list(ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9), V1 = c("A",
"B", "A", "B", "B", "A", "A", "B", "B"), V2 = c(21, 233, 185,
85, 208, 112, 238, 66, 38), V3 = c(149, 250, 218, 104, 62, 19,
175, 168, 28), Jan = c(10, 20, 10, 12, 76, 28, 137, 162, 101),
Feb = c(20, 25, 15, 0, 89, 0, 152, 177, 119), March = c(0,
28, 20, 14, 108, 0, 165, 194, 132), April = c(0, 34, 25,
16, 125, 71, 181, 208, 149), May = c(25, 0, 30, 22, 135,
0, 191, 224, 169), June = c(29, 0, 35, 24, 145, 0, 205, 244,
187), July = c(34, 0, 40, 28, 163, 0, 217, 256, 207), August = c(37,
0, 45, 29, 173, 0, 228, 276, 221), Sep = c(0, 39, 50, 31,
193, 0, 239, 308, 236), Oct = c(0, 48, 55, 35, 210, 163,
252, 0, 247), Nov = c(48, 55, 60, 40, 221, 183, 272, 0, 264
), Dec = c(50, 60, 65, 45, 239, 195, 289, 0, 277), `Second largest run` = c(1,
NA, NA, NA, NA, 2, NA, NA, NA), result = c(2, 4, -Inf, 1,
-Inf, 5, -Inf, 3, -Inf)), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"))
code
# Length of the longest run of `val` (default 0) in x.
# NOTE(review): when x contains no run of `val`, lengths[values == val] is
# empty and max() returns -Inf with a warning — that is where the -Inf
# values in the expected `result` column come from.
most_consecutive_val = function(x, val = 0) {
with(rle(x), max(lengths[values == val]))
}
# Apply row-wise over the month columns only (drops ID, V1:V3 and column 17)
test$result=apply(test[,-c(1:4,17)], MARGIN = 1, most_consecutive_val)
Rather than taking the max from the run length encoding (rle) function, we want to sort the output and then extract the desired index. We'll get NA's when we request an index that doesn't exist -- where there isn't a second run of zeroes in row 2 for example.
# Return the idx-th longest run length of `val` in x.
# Indexing past the number of runs yields NA (e.g. idx = 2 when there is
# only a single run of zeroes), which is the desired "no second run" marker.
ordered_runs <- function(x, val = 0, idx = 1) {
  runs <- rle(x)
  matching_lengths <- runs$lengths[runs$values == val]
  sort(matching_lengths, decreasing = TRUE)[idx]
}
# Longest and second-longest zero runs per row, computed over the month
# columns only (dropping ID, V1:V3 and the two pre-existing result columns)
test$result_1 <- apply(test[,-c(1:4,17:18)], MARGIN = 1, ordered_runs, idx = 1)
test$result_2 <- apply(test[,-c(1:4,17:18)], MARGIN = 1, ordered_runs, idx = 2)
The output is slightly different from what you expected — (1) it uses NA rather than -Inf, and (2) in your first row I believe there is a tie, with a second run of 2 zeroes.
> test[,c(1,17:20)]
# A tibble: 9 x 5
ID `Second largest run` result result_1 result_2
<dbl> <dbl> <dbl> <int> <int>
1 1 1 2 2 2
2 2 NA 4 4 NA
3 3 NA -Inf NA NA
4 4 NA 1 1 NA
5 5 NA -Inf NA NA
6 6 2 5 5 2
7 7 NA -Inf NA NA
8 8 NA 3 3 NA
9 9 NA -Inf NA NA
Here is an option using data.table which should be quite fast for OP's large dataset and also identifies all sequences of zeros simultaneously:
library(data.table)
setDT(DF)
cols <- c("Jan", "Feb", "March", "April", "May", "June", "July", "August", "Sep", "Oct", "Nov", "Dec")
# Convert into long format: one row per ID x month
m <- melt(DF, measure.vars=cols)[
# rleid(ID, value) numbers each consecutive run of the same value within an
# ID; rowid(rl) is the position inside that run. The `rl <-` inside .() is
# a trick that lets rowid() reuse the run id computed in the same call.
order(ID), c("rl", "rw") := .(rl <- rleid(ID, value), rowid(rl))][
# For zero runs, keep the last position (.N) of each run — i.e. its length
value == 0L, .(ID=ID[.N], len=rw[.N]), rl][
# Sort run lengths in descending order within each ID
order(ID, -len)]
# Pivot the run lengths to wide (columns "1", "2", ... = longest, second
# longest, ...) and write them onto DF by reference via an update join
wide <- dcast(m, ID ~ rowid(ID), value.var="len")
DF[wide, on=.(ID), (names(wide)) := mget(names(wide))]
output:
ID V1 V2 V3 Jan Feb March April May June July August Sep Oct Nov Dec 1 2
1: 1 A 21 149 10 20 0 0 25 29 34 37 0 0 48 50 2 2
2: 2 B 233 250 20 25 28 34 0 0 0 0 39 48 55 60 4 NA
3: 3 A 185 218 10 15 20 25 30 35 40 45 50 55 60 65 NA NA
4: 4 B 85 104 12 0 14 16 22 24 28 29 31 35 40 45 1 NA
5: 5 B 208 62 76 89 108 125 135 145 163 173 193 210 221 239 NA NA
6: 6 A 112 19 28 0 0 71 0 0 0 0 0 163 183 195 5 2
7: 7 A 238 175 137 152 165 181 191 205 217 228 239 252 272 289 NA NA
8: 8 B 66 168 162 177 194 208 224 244 256 276 308 0 0 0 3 NA
9: 9 B 38 28 101 119 132 149 169 187 207 221 236 247 264 277 NA NA
data:
# dput() of the input data used above, with the expected longest (`1`) and
# second-longest (`2`) zero-run columns included for comparison
DF <- structure(list(ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9), V1 = c("A",
"B", "A", "B", "B", "A", "A", "B", "B"), V2 = c(21, 233, 185,
85, 208, 112, 238, 66, 38), V3 = c(149, 250, 218, 104, 62, 19,
175, 168, 28), Jan = c(10, 20, 10, 12, 76, 28, 137, 162, 101),
Feb = c(20, 25, 15, 0, 89, 0, 152, 177, 119), March = c(0,
28, 20, 14, 108, 0, 165, 194, 132), April = c(0, 34, 25,
16, 125, 71, 181, 208, 149), May = c(25, 0, 30, 22, 135,
0, 191, 224, 169), June = c(29, 0, 35, 24, 145, 0, 205, 244,
187), July = c(34, 0, 40, 28, 163, 0, 217, 256, 207), August = c(37,
0, 45, 29, 173, 0, 228, 276, 221), Sep = c(0, 39, 50, 31,
193, 0, 239, 308, 236), Oct = c(0, 48, 55, 35, 210, 163,
252, 0, 247), Nov = c(48, 55, 60, 40, 221, 183, 272, 0, 264
), Dec = c(50, 60, 65, 45, 239, 195, 289, 0, 277), `1` = c(2L,
4L, NA, 1L, NA, 5L, NA, 3L, NA), `2` = c(2L, NA, NA, NA,
NA, 2L, NA, NA, NA)), row.names = c(NA, -9L), class = "data.frame")