Map function to iterate t.test in R

I have a dataset that is a little tidier than the original, thanks to a StackOverflow colleague to whom I'm really grateful.
The data frame is composed of a patient ID with several measurements over time (00 = basal, 66 = 6 months, 01 = 12 months). I know the data frame is not in chronological order.
df1 <- data.frame(pacient = c(6430, 6430, 6430, 6494, 6494, 6494, 6165, 6165, 6165),
                  time = c(00, 01, 66, 00, 01, 66, 00, 01, 66),
                  weight = c(115, 112, 110, 98, 95, 94, 88, 87, 86),
                  waist = c(123, NA, 112, 115, 112, 113, 112, 110, NA),
                  p14_total = c(7, NA, 4, 12, 5, NA, 15, 12, 13))
I am trying to perform t.test comparisons between the measurements at the different time points, NOT paired. For example: weight at 00 vs weight at 66, weight at 00 vs weight at 01, weight at 66 vs weight at 01.
I am looking for a data frame or data.table to export with the statistics (t, p-value, means, ...).
I also want to create a column with the difference between the time measurements for each patient. For example, for patient ID 6430:
Weight_6months = Weight01 - Weight66
Weight_12months= Weight01 - Weight00
I have been trying to do this with purrr::map functions, but I'm not reaching the objective.

If I understand the question, then a simple solution is to pivot the data wider, then perform the differencing between the time points:
NB: if there are lots of time points, using mutate() with across() allows selecting the columns without naming each one (see the sketch after the output below).
library(tidyverse)
df1 <- data.frame(pacient = c(6430, 6430, 6430, 6494, 6494, 6494, 6165, 6165, 6165),
                  time = c(00, 01, 66, 00, 01, 66, 00, 01, 66),
                  weight = c(115, 112, 110, 98, 95, 94, 88, 87, 86),
                  waist = c(123, NA, 112, 115, 112, 113, 112, 110, NA),
                  p14_total = c(7, NA, 4, 12, 5, NA, 15, 12, 13)) %>%
  as_tibble()
df2 <- df1 %>%
  group_by(pacient) %>%
  pivot_wider(names_from = time, values_from = c(weight, waist, p14_total)) %>%
  rowwise() %>%
  mutate(weight_diff_1 = weight_66 - weight_1, weight_diff_2 = weight_1 - weight_0)
# A tibble: 3 x 12
# Rowwise: pacient
pacient weight_0 weight_1 weight_66 waist_0 waist_1 waist_66 p14_total_0 p14_total_1 p14_total_66 weight_diff_1 weight_diff_2
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 6430 115 112 110 123 NA 112 7 NA 4 -2 -3
2 6494 98 95 94 115 112 113 12 5 NA -1 -3
3 6165 88 87 86 112 110 NA 15 12 13 -1 -1
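To illustrate the across() note above, here is a hedged sketch (not part of the original answer; it assumes dplyr >= 1.0 for across() and cur_column()) that forms the 66-vs-1 difference for every measure at once, without naming each column:
df2 %>%
  ungroup() %>%
  mutate(across(ends_with("_66"),
                ~ .x - get(sub("_66$", "_1", cur_column())),
                .names = "{.col}_minus_1"))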
And then the t.test on the columns, for example:
t.test(df2$waist_0, df2$weight_1)
Welch Two Sample t-test
data: df2$waist_0 and df2$weight_1
t = 2.3133, df = 2.7634, p-value = 0.1112
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-8.302625 45.635958
sample estimates:
mean of x mean of y
116.6667 98.0000
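To get all the pairwise, unpaired comparisons into one exportable data frame, as asked, here is a hedged sketch with purrr; broom::tidy() is an assumption, not something the original answer used:
library(broom)  # assumed available; tidy() turns an htest into a one-row tibble

measures <- c("weight", "waist", "p14_total")
pairs    <- combn(sort(unique(df1$time)), 2, simplify = FALSE)  # 0 vs 1, 0 vs 66, 1 vs 66

stats <- expand_grid(measure = measures, pair = pairs) %>%
  mutate(comparison = map_chr(pair, paste, collapse = " vs "),
         fit = map2(measure, pair,
                    ~ tidy(t.test(df1[[.x]][df1$time == .y[1]],
                                  df1[[.x]][df1$time == .y[2]])))) %>%
  select(measure, comparison, fit) %>%
  unnest(fit)
# stats now holds one row per measure/comparison with estimate1, estimate2,
# statistic (t), p.value and the confidence interval, ready to export.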

Related

How to find specific multi-variable thresholds for AUC cutpoint?

I want to figure out how to find the variable cutoff points for a glm. cutpointr does this really well for a single variable, but how would I go about extracting this information for more complex models? Say I fit ~ glucose + age + mass: could I say that at the optimal cutoff the predictors take values x, y and z, or is this not possible? Or would it be more like a single number that is the combined cutoff of the variables?
#for reprex
library(reprex)
library(mlbench)
library(tidymodels)
library(cutpointr)
#>
#> Attaching package: 'cutpointr'
#> The following objects are masked from 'package:yardstick':
#>
#> accuracy, npv, ppv, precision, recall, sensitivity, specificity
library(ggplot2)
library(tidyverse)
data(PimaIndiansDiabetes2)
head(PimaIndiansDiabetes2)
#> pregnant glucose pressure triceps insulin mass pedigree age diabetes
#> 1 6 148 72 35 NA 33.6 0.627 50 pos
#> 2 1 85 66 29 NA 26.6 0.351 31 neg
#> 3 8 183 64 NA NA 23.3 0.672 32 pos
#> 4 1 89 66 23 94 28.1 0.167 21 neg
#> 5 0 137 40 35 168 43.1 2.288 33 pos
#> 6 5 116 74 NA NA 25.6 0.201 30 neg
Diabetes <- na.omit(PimaIndiansDiabetes2)
glimpse(PimaIndiansDiabetes2)
#> Rows: 768
#> Columns: 9
#> $ pregnant <dbl> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, 5, 7, 0, 7, 1, 1…
#> $ glucose <dbl> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125, 110, 168, 139,…
#> $ pressure <dbl> 72, 66, 64, 66, 40, 74, 50, NA, 70, 96, 92, 74, 80, 60, 72, N…
#> $ triceps <dbl> 35, 29, NA, 23, 35, NA, 32, NA, 45, NA, NA, NA, NA, 23, 19, N…
#> $ insulin <dbl> NA, NA, NA, 94, 168, NA, 88, NA, 543, NA, NA, NA, NA, 846, 17…
#> $ mass <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.3, 30.5, NA, 37.…
#> $ pedigree <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.134, 0.158…
#> $ age <dbl> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 34, 57, 59, 51, 3…
#> $ diabetes <fct> pos, neg, pos, neg, pos, neg, pos, neg, pos, pos, neg, pos, n…
cp <- cutpointr(Diabetes, glucose, diabetes,
                method = maximize_metric, metric = sum_sens_spec)
#> Assuming the positive class is pos
#> Assuming the positive class has higher x values
plot_cutpointr(cp)
summary(cp)
#> Method: maximize_metric
#> Predictor: glucose
#> Outcome: diabetes
#> Direction: >=
#>
#> AUC n n_pos n_neg
#> 0.8058 392 130 262
#>
#> optimal_cutpoint sum_sens_spec acc sensitivity specificity tp fn fp tn
#> 128 1.5055 0.7628 0.7231 0.7824 94 36 57 205
#>
#> Predictor summary:
#> Data Min. 5% 1st Qu. Median Mean 3rd Qu. 95% Max. SD NAs
#> Overall 56 81.0 99.00 119.0 122.6276 143.00 181.00 198 30.86078 0
#> neg 56 79.0 94.00 107.5 111.4313 126.00 154.00 197 24.64213 0
#> pos 78 95.9 124.25 144.5 145.1923 171.75 188.55 198 29.83939 0
res_unnested <- cp %>%
  unnest(cols = roc_curve)
annotation <- paste0("AUC: ", round(cp$AUC, 2), "\n",
                     "Cutpoint: ", round(cp$optimal_cutpoint, 2))
ggplot(res_unnested, aes(x = 1 - tnr, y = tpr)) +
  xlab("1 - Specificity") +
  ylab("Sensitivity") +
  theme_bw() +
  theme(aspect.ratio = 1) +
  geom_line(color = "red") +
  geom_vline(xintercept = 1 - cp$specificity, linetype = "dotted") +
  geom_hline(yintercept = cp$sensitivity, linetype = "dotted") +
  annotate("text", x = 0.85, y = 0.05, label = annotation) +
  ggtitle("ROC curve", "Using glucose mg/dL as a predictive logistic variable for diabetes") +
  geom_abline(intercept = 0, slope = 1, linetype = 2)
library(Epi)  # ROC() below comes from the Epi package
ROC(form = diabetes ~ glucose + age + mass, data = Diabetes, plot = "ROC", MX = TRUE)
I have tried adding more parameters to cutpointr, without success. I have also run the model with Epi and saw a better AUC with age and mass included. I have also run a glm, but I am just not sure how to properly analyze the glm for this type of information. I'm looking on the tidymodels website for help while waiting for suggestions, thanks!
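One common workaround (a hedged sketch, not from the original post) is to collapse the multi-variable model into a single predictor, the glm's fitted probability, and find the cutpoint on that scale:
# fit the multi-variable logistic model, then use its predicted
# probability as the single predictor for cutpointr
fit <- glm(diabetes ~ glucose + age + mass, data = Diabetes, family = binomial)
Diabetes$prob <- predict(fit, type = "response")

cp_multi <- cutpointr(Diabetes, prob, diabetes,
                      method = maximize_metric, metric = sum_sens_spec)
summary(cp_multi)
Note that the optimal cutpoint then lives on the probability scale: it cannot be decomposed into one threshold per predictor, because many combinations of glucose, age and mass map to the same predicted probability.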

Looking for an R function that counts the number of times two columns appear together

I have a data.frame with many rows. I am trying to produce a new data.frame summarizing the total row count for all combinations of V_ID and N_ID.
In the below, df1 is an example of my data and df2 is an example of the desired output.
df1 <- data.frame(V_ID = c(1234, 5252, 1234, 1234, 1234, 5252, 5252, 6754),
                  N_ID = c(45, 23, 45, 45, 45, 22, 23, 11),
                  Length = c(88, 33, 88, 88, 88, 33, 33, 22))
df2 <- data.frame(V_ID = c(1234, 5252, 5252, 6754),
                  N_ID = c(45, 23, 22, 11),
                  Num_Times = c(4, 2, 1, 1),
                  Length = c(88, 33, 33, 22))
Is there a way to do this in dplyr?
You can use count() from the dplyr package.
library(dplyr)
df1 %>% count(V_ID, N_ID, Length, name = "Num_Times")
#> V_ID N_ID Length Num_Times
#> 1 1234 45 88 4
#> 2 5252 22 33 1
#> 3 5252 23 33 2
#> 4 6754 11 22 1
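count() here is shorthand for grouping and tallying; an equivalent, more explicit spelling (shown only for illustration) is:
df1 %>%
  group_by(V_ID, N_ID, Length) %>%
  summarise(Num_Times = n(), .groups = "drop")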

Find maximum in a group, subset by a subset from a different dataframe, to select other values

I have two data.frames: df1 with raw data, and df2 with information on where to look in df1.
df1 has groups, defined by id. In each group, a subset is defined by df2$value_a1 and df2$value_a2, which give the range of value_a rows to look at in that group. In that subset I want to find the row with the maximum value_b, to select its other values.
code for df1 and df2:
df1 <- data.frame(id = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3),
                  value_a = c(0, 10, 21, 30, 43, 53, 69, 81, 93, 5, 16, 27, 33, 45, 61, 75, 90, 2, 11, 16, 24, 31, 40, 47, 60, 75, 88),
                  value_b = c(100, 101, 100, 95, 90, 104, 88, 84, 75, 110, 105, 106, 104, 95, 109, 96, 89, 104, 104, 104, 103, 106, 103, 101, 99, 98, 97),
                  value_c = c(0, -1, -2, -2, -2, -2, -1, -1, 0, 0, 0, 0, 1, 1, 2, 2, 1, -1, 0, 0, 1, 1, 2, 2, 1, 1, 0),
                  value_d = 1:27)
df2 <- data.frame(id = c(1, 2, 3), value_a1 = c(21, 33, 16), value_a2 = c(69, 75, 60))
This is df1
id value_a value_b value_c value_d
1 1 0 100 0 1
2 1 10 101 -1 2
3 1 21 100 -2 3
4 1 30 95 -2 4
5 1 43 90 -2 5
6 1 53 104 -2 6
7 1 69 88 -1 7
8 1 81 84 -1 8
9 1 93 75 0 9
10 2 5 110 0 10
11 2 16 105 0 11
12 2 27 106 0 12
13 2 33 104 1 13
14 2 45 95 1 14
15 2 61 109 2 15
16 2 75 96 2 16
17 2 90 89 1 17
18 3 2 104 -1 18
19 3 11 104 0 19
20 3 16 104 0 20
21 3 24 103 1 21
22 3 31 106 1 22
23 3 40 103 2 23
24 3 47 101 2 24
25 3 60 99 1 25
26 3 75 98 1 26
27 3 88 97 0 27
This is df2
id value_a1 value_a2
1 1 21 69
2 2 33 75
3 3 16 60
My result would be df3, which would look like this
id value_a value_c
1 1 53 -2
2 2 61 2
3 3 31 1
I wrote this code to show my line of thinking.
df3 <- df1 %>%
  group_by(id) %>%
  filter(value_a >= df2$value_a1 & value_a <= df2$value_a2) %>%
  filter(value_a == max(value_a)) %>%
  pull(value_b)
This however generates a vector with three entries:
[1] 88 95 99
These are not the maximum value_b's...
Perhaps by() would work, but I get stuck using a function across two different data frames.
It feels like I'm almost there, but still far away...
You can try this. I hope this helps.
df1 %>%
  left_join(df2) %>%
  mutate(val = ifelse(value_a >= value_a1 & value_a <= value_a2, value_b, NA)) %>%
  group_by(id) %>%
  summarise(val = max(val, na.rm = TRUE))
# A tibble: 3 x 2
id val
<dbl> <dbl>
1 1 104
2 2 109
3 3 106
Why don't you try a merge?
Then with data.table syntax:
library(data.table)
setDT(df1); setDT(df2)  # make both data.tables, so df3 supports data.table syntax
df3 <- merge(df1, df2, by = "id", all.x = TRUE)
max_values <- df3[value_a >= value_a1 & value_a <= value_a2, max(value_b), by = "id"]
max_values
# id V1
# 1: 1 104
# 2: 2 109
# 3: 3 106
I would do this using the data.table package, since it's just what I'm used to:
library(data.table)
dt.1 <- data.table("id" = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), "value_a" = c(0, 10, 21, 30, 43, 53, 69, 81, 93, 5, 16, 27, 33, 45, 61, 75, 90, 2, 11, 16, 24, 31, 40, 47, 60, 75, 88), "value_b" = c(100, 101, 100, 95, 90, 104, 88, 84, 75, 110, 105, 106, 104, 95, 109, 96, 89, 104, 104, 104, 103, 106, 103, 101, 99, 98, 97), "value_c" = c(0, -1, -2, -2, -2, -2, -1, -1, 0, 0, 0, 0, 1, 1, 2, 2, 1, -1, 0, 0, 1, 1, 2, 2, 1, 1, 0), "value_d" = c(1:27))
dt.2 <- data.table("id" = c(1, 2, 3), "value_a1" = c(21, 33, 16), "value_a2" = c(69, 75, 60))
# NB: this takes the maximum value_b over all rows of each id;
# it does not apply the value_a1/value_a2 range from dt.2
dt.3 <- dt.1[id %in% dt.2[, id], max(value_b), by = "id"]
setnames(dt.3, "V1", "max_value_b")
dt.3
To get the corresponding line where value_b is the maximum, there are several ways; here's one where I only modified a line from the previous code:
dt.1[id %in% dt.2[,id],.SD[which.max(value_b), .(value_a, value_b, value_c, value_d)], by="id"]
.SD is the sub-table already selected with by, so for each id, which.max() picks the row holding the local maximum value_b, and .() (an alias for list()) lists the columns you want from that row.
Perhaps a more readable approach is to first select the desired rows:
max.b.rows <- dt.1[id %in% dt.2[, id], which.max(value_b), by = "id"][, V1]
dt.3 <- dt.1[max.b.rows, ]
BTW, the id %in% dt.2[, id] part is just there to make sure you only select maxima for the ids present in table 2.
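For completeness, a hedged sketch (not from the original thread) that applies the value_a range with a data.table non-equi join, which none of the answers above does, and returns exactly the columns the desired df3 asked for:
library(data.table)
setDT(df1); setDT(df2)
# keep only df1 rows whose value_a falls inside [value_a1, value_a2] for their id
in_range <- df1[df2,
                on = .(id, value_a >= value_a1, value_a <= value_a2),
                .(id, value_a = x.value_a, value_b = x.value_b, value_c = x.value_c)]
# then take the row with the maximum value_b per id
df3 <- in_range[, .SD[which.max(value_b), .(value_a, value_c)], by = id]
df3
#    id value_a value_c
# 1:  1      53      -2
# 2:  2      61       2
# 3:  3      31       1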

How can I sum sets of consecutive columns in a data.frame in R

I have a table that I read in using the function read_csv. As an example, I have created a small data.frame to illustrate the structure.
In reality, the table holds counts of 48160 bacteria (ASV_1 to ASV_48160, the rows) found in different samples (168 columns). The samples run from sample 1, S1(a, b, c), to sample 56, S56(a, b, c), where a, b and c are sub-samples.
My objective is to sum the a, b, c columns within every sample and get just one value. So, instead of having S1a, S1b, S1c, I want to sum them into a single S1 column, keeping the same number of rows, and do the same for all 56 samples. Eventually, I want to keep just columns S1, S2, S3, ..., S56 and drop the a, b, c columns.
Bacteria <- data.frame(
  ID = c("ASV_1", "ASV_2", "ASV_3", "ASV_4", "ASV_5", "ASV_6", "ASV_7", "ASV_8", "ASV_9", "ASV_10"),
  s1a = c(135, 249, 142, 130, 20, 0, 0, 10, 85, 32),
  s1b = c(14, 85, 0, 12, 0, 59, 0, 0, 96, 43),
  s1c = c(0, 2, 8, 56, 145, 25, 65, 32, 25, 0),
  s2a = c(54, 88, 65, 14, 0, 0, 4, 165, 87, 0),
  s2b = c(33, 0, 74, 19, 22, 0, 5, 0, 3, 10),
  s2c = c(178, 0, 1, 1, 0, 1, 66, 3, 0, 221),
  s3a = c(135, 249, 12, 130, 20, 110, 0, 10, 55, 32),
  s3b = c(99, 88, 65, 58, 0, 0, 4, 5, 87, 0),
  s3c = c(90, 54, 6, 8, 0, 11, 91, 1, 9, 0))
str(Bacteria)
I have tried some options with mutate from the tidyverse, but that works for one sample (or at least I don't know how to repeat the operation for the 56 samples), and I don't want to do it manually from 1 to 56:
Bacteria <- Bacteria %>%
  mutate(s1 = s1a + s1b + s1c)
I have also tried rowSums, but again I don't know how to repeat this line of code for the other samples (I am a beginner in R).
Does anyone know a better option? I appreciate the help!
One approach would be to pivot the data to long form, remove the subsample identifier, and then pivot back to wide, using the values_fn argument to sum the values that now share a column name.
library(dplyr)
library(tidyr)
Bacteria %>%
  pivot_longer(-ID) %>%
  mutate(name = sub("[a-c]$", "", name)) %>%
  pivot_wider(values_from = value, names_from = name, values_fn = list(value = sum))
# A tibble: 10 x 4
ID s1 s2 s3
<chr> <dbl> <dbl> <dbl>
1 ASV_1 149 265 324
2 ASV_2 336 88 391
3 ASV_3 150 140 83
4 ASV_4 198 34 196
5 ASV_5 165 22 20
6 ASV_6 84 1 121
7 ASV_7 65 75 95
8 ASV_8 42 168 16
9 ASV_9 206 90 151
10 ASV_10 75 231 32
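An alternative that avoids reshaping is a base-R sketch (hedged, with hypothetical object names): group the columns by their sample prefix and row-sum each group.
# strip the trailing a/b/c to get each column's sample prefix,
# split the columns by prefix, and rowSums each group
prefixes <- sub("[a-c]$", "", names(Bacteria)[-1])
summed <- sapply(split.default(Bacteria[-1], prefixes), rowSums)
Bacteria_summed <- data.frame(ID = Bacteria$ID, summed)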

modify function to find the maximum length for a consecutive run of a number per row

I would like to modify the answer to the question here, or find a new solution, to include another column showing the second-largest consecutive run of "0" per row. My sample data and code are below; the function operates on the month columns, and the "Second largest run" column is what I hope to add. I am working with a large dataset, so the more efficient the better. Any ideas are appreciated, thanks.
sample data
structure(list(ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9), V1 = c("A",
"B", "A", "B", "B", "A", "A", "B", "B"), V2 = c(21, 233, 185,
85, 208, 112, 238, 66, 38), V3 = c(149, 250, 218, 104, 62, 19,
175, 168, 28), Jan = c(10, 20, 10, 12, 76, 28, 137, 162, 101),
Feb = c(20, 25, 15, 0, 89, 0, 152, 177, 119), March = c(0,
28, 20, 14, 108, 0, 165, 194, 132), April = c(0, 34, 25,
16, 125, 71, 181, 208, 149), May = c(25, 0, 30, 22, 135,
0, 191, 224, 169), June = c(29, 0, 35, 24, 145, 0, 205, 244,
187), July = c(34, 0, 40, 28, 163, 0, 217, 256, 207), August = c(37,
0, 45, 29, 173, 0, 228, 276, 221), Sep = c(0, 39, 50, 31,
193, 0, 239, 308, 236), Oct = c(0, 48, 55, 35, 210, 163,
252, 0, 247), Nov = c(48, 55, 60, 40, 221, 183, 272, 0, 264
), Dec = c(50, 60, 65, 45, 239, 195, 289, 0, 277), `Second largest run` = c(1,
NA, NA, NA, NA, 2, NA, NA, NA), result = c(2, 4, -Inf, 1,
-Inf, 5, -Inf, 3, -Inf)), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"))
code
most_consecutive_val = function(x, val = 0) {
  with(rle(x), max(lengths[values == val]))
}
test$result = apply(test[, -c(1:4, 17)], MARGIN = 1, most_consecutive_val)
Rather than taking the max from the run-length encoding (rle) output, we can sort the run lengths and then extract the desired index. We get NA when we request an index that doesn't exist, for example where there isn't a second run of zeroes in row 2.
ordered_runs = function(x, val = 0, idx = 1) {
  with(rle(x), sort(lengths[values == val], decreasing = TRUE))[idx]
}
test$result_1 <- apply(test[,-c(1:4,17:18)], MARGIN = 1, ordered_runs, idx = 1)
test$result_2 <- apply(test[,-c(1:4,17:18)], MARGIN = 1, ordered_runs, idx = 2)
The output differs slightly from what you expected: (1) it uses NA rather than -Inf, and (2) in your first row I believe there is a tie, with a second run of 2 zeroes.
> test[,c(1,17:20)]
# A tibble: 9 x 5
ID `Second largest run` result result_1 result_2
<dbl> <dbl> <dbl> <int> <int>
1 1 1 2 2 2
2 2 NA 4 4 NA
3 3 NA -Inf NA NA
4 4 NA 1 1 NA
5 5 NA -Inf NA NA
6 6 2 5 5 2
7 7 NA -Inf NA NA
8 8 NA 3 3 NA
9 9 NA -Inf NA NA
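To see the mechanics of ordered_runs(), a quick illustration on a made-up vector (hypothetical values, not from the sample data):
x <- c(10, 0, 0, 5, 0, 0, 0, 3)
rle(x)
#> Run Length Encoding
#>   lengths: int [1:5] 1 2 1 3 1
#>   values : num [1:5] 10 0 5 0 3
sort(rle(x)$lengths[rle(x)$values == 0], decreasing = TRUE)
#> [1] 3 2
# Indexing the sorted lengths with [1] gives 3, with [2] gives 2, and with [3]
# gives NA -- that out-of-range NA is what fills rows with no second run of zeroes.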
Here is an option using data.table, which should be quite fast for the OP's large dataset and identifies all runs of zeros simultaneously:
library(data.table)
setDT(DF)
cols <- c("Jan", "Feb", "March", "April", "May", "June", "July", "August", "Sep", "Oct", "Nov", "Dec")
# convert into a long format
m <- melt(DF, measure.vars = cols)[
  # identify consecutive runs of the same value within each ID and count along each run
  order(ID), c("rl", "rw") := .(rl <- rleid(ID, value), rowid(rl))][
  # keep the last element of each run of zeros (that is the run's length)
  value == 0L, .(ID = ID[.N], len = rw[.N]), rl][
  # sort in descending order of run length
  order(ID, -len)]

# pivot into wide format and perform an update join
wide <- dcast(m, ID ~ rowid(ID), value.var = "len")
DF[wide, on = .(ID), (names(wide)) := mget(names(wide))]
output:
ID V1 V2 V3 Jan Feb March April May June July August Sep Oct Nov Dec 1 2
1: 1 A 21 149 10 20 0 0 25 29 34 37 0 0 48 50 2 2
2: 2 B 233 250 20 25 28 34 0 0 0 0 39 48 55 60 4 NA
3: 3 A 185 218 10 15 20 25 30 35 40 45 50 55 60 65 NA NA
4: 4 B 85 104 12 0 14 16 22 24 28 29 31 35 40 45 1 NA
5: 5 B 208 62 76 89 108 125 135 145 163 173 193 210 221 239 NA NA
6: 6 A 112 19 28 0 0 71 0 0 0 0 0 163 183 195 5 2
7: 7 A 238 175 137 152 165 181 191 205 217 228 239 252 272 289 NA NA
8: 8 B 66 168 162 177 194 208 224 244 256 276 308 0 0 0 3 NA
9: 9 B 38 28 101 119 132 149 169 187 207 221 236 247 264 277 NA NA
data:
DF <- structure(list(ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9), V1 = c("A",
"B", "A", "B", "B", "A", "A", "B", "B"), V2 = c(21, 233, 185,
85, 208, 112, 238, 66, 38), V3 = c(149, 250, 218, 104, 62, 19,
175, 168, 28), Jan = c(10, 20, 10, 12, 76, 28, 137, 162, 101),
Feb = c(20, 25, 15, 0, 89, 0, 152, 177, 119), March = c(0,
28, 20, 14, 108, 0, 165, 194, 132), April = c(0, 34, 25,
16, 125, 71, 181, 208, 149), May = c(25, 0, 30, 22, 135,
0, 191, 224, 169), June = c(29, 0, 35, 24, 145, 0, 205, 244,
187), July = c(34, 0, 40, 28, 163, 0, 217, 256, 207), August = c(37,
0, 45, 29, 173, 0, 228, 276, 221), Sep = c(0, 39, 50, 31,
193, 0, 239, 308, 236), Oct = c(0, 48, 55, 35, 210, 163,
252, 0, 247), Nov = c(48, 55, 60, 40, 221, 183, 272, 0, 264
), Dec = c(50, 60, 65, 45, 239, 195, 289, 0, 277), `1` = c(2L,
4L, NA, 1L, NA, 5L, NA, 3L, NA), `2` = c(2L, NA, NA, NA,
NA, 2L, NA, NA, NA)), row.names = c(NA, -9L), class = "data.frame")
