Automatic extraction of p-value from data.frame - r

I want to compare protein expression values (n=465 proteins) for two groups of patients (resistant vs. sensitive).
I have 11 resistant patients and 8 sensitive patients. I would like to compare (ttest) expression values of protein 1 of the resistant group (A res to K res) with that of the sensitive group (L sens to S sens), protein 2 (resistant) with protein 2 (sensitive), and so on. As an output I want only the proteins where the p-value is <0.05.
I tried to do this (see below), but there is something wrong and I can not figure out what.
X Protein.1 Protein.2 Protein.3 Protein.4 Protein.5 Protein.6
1 A res 4127 16886 1785 1636 407 135
2 B res 10039 32414 3144 1543 601 154
3 C res 527 1059 1637 317 229 107
4 D res 553 3848 7357 1168 1549 441
5 E res 2351 2272 5868 2606 517 159
6 F res 822 1767 2110 818 293 75
7 G res 673 1887 511 471 214 NA
8 H res 5769 2206 2041 517 355 298
9 I res 1660 4221 1921 629 383 104
10 J res 3281 1804 2400 225 268 52
11 K res 3383 1882 1935 185 NA NA
12 L sens 10810 20136 2350 1143 527 160
13 M sens 5941 14873 3550 943 308 NA
14 N sens 1100 2325 1359 561 542 284
15 O sens 85 587 619 364 85 52
16 P sens 2321 6335 6494 994 NA NA
17 Q sens 103810 7102 7986 1464 439 187
18 R sens 1174 2076 1423 340 186 70
19 S sens 1829 973 1343 380 453 221
# Read the semicolon-separated table; cells containing "weak" are read as NA.
data <- read.csv("ProteinDataResSens.csv", sep=";", na.strings="weak", header=TRUE)
# Rows 1-11 are copied into `res` and rows 12-19 into `sens`; the columns of
# each copy are then renamed res_1, res_2, ... and sens_1, sens_2, ...
res <- data.frame(data[1:11, ], row.names=NULL)
colnames(res) <- paste("res", 1:length(res), sep="_")
sens <- data.frame(data[12:19, ], row.names=NULL)
colnames(sens) <- paste("sens", 1:length(sens), sep="_")
# Every unordered pair of the renamed column names (not one pair per protein).
com <- combn(c(colnames(res), colnames(sens)), 2)
# NOTE(review): only the copies were renamed, so `data` has no "res_*" or
# "sens_*" columns to look up here -- likely where this attempt fails; confirm.
p <- apply(com, 2, function(x) t.test(data[, x[1]], data[, x[2]])$p.val)
# One row per pair: a text label plus the p-value computed above.
data.frame(comparison=paste(com[1, ], com[2, ],sep=" vs."), p.value=p)
Thank you very much for any help!

If you want to compare res against sens for each Protein column:
# Group label per row: the text after the last space in X ("res" or "sens").
grp <- sub(".* ", "", df$X)
# t-test of the res values against the sens values for every protein column.
# vapply() guarantees exactly one numeric p-value per column, and
# drop = FALSE keeps a data frame even when only one protein column exists.
Pvals <- vapply(
  df[, -1, drop = FALSE],
  function(x) t.test(x[grp == "res"], x[grp == "sens"])$p.value,
  numeric(1)
)
# Keep only the proteins whose p-value is below 0.05.
Pvals[Pvals < 0.05]
Or using data.table
# data.table variant: setDT() converts df to a data.table by reference and
# `:=` adds a grp column (the text after the last space in X) to df itself.
# The second step runs the res-vs-sens t-test on each column in .SDcols.
# NOTE(review): 2:(ncol(df)-1) presumably skips X (column 1) and the grp
# column appended last -- confirm the column count at evaluation time.
library(data.table)
setDT(df)[, grp:= sub('.* ', "", X)][, lapply(.SD,
function(x) t.test(x[grp=='res'], x[grp=='sens'])$p.value),
.SDcols=2:(ncol(df)-1)]
data
df <- structure(list(X = c("A res", "B res", "C res", "D res", "E res",
"F res", "G res", "H res", "I res", "J res", "K res", "L sens",
"M sens", "N sens", "O sens", "P sens", "Q sens", "R sens", "S sens"
), Protein.1 = c(4127L, 10039L, 527L, 553L, 2351L, 822L, 673L,
5769L, 1660L, 3281L, 3383L, 10810L, 5941L, 1100L, 85L, 2321L,
103810L, 1174L, 1829L), Protein.2 = c(16886L, 32414L, 1059L,
3848L, 2272L, 1767L, 1887L, 2206L, 4221L, 1804L, 1882L, 20136L,
14873L, 2325L, 587L, 6335L, 7102L, 2076L, 973L), Protein.3 = c(1785L,
3144L, 1637L, 7357L, 5868L, 2110L, 511L, 2041L, 1921L, 2400L,
1935L, 2350L, 3550L, 1359L, 619L, 6494L, 7986L, 1423L, 1343L),
Protein.4 = c(1636L, 1543L, 317L, 1168L, 2606L, 818L, 471L,
517L, 629L, 225L, 185L, 1143L, 943L, 561L, 364L, 994L, 1464L,
340L, 380L), Protein.5 = c(407L, 601L, 229L, 1549L, 517L,
293L, 214L, 355L, 383L, 268L, NA, 527L, 308L, 542L, 85L,
NA, 439L, 186L, 453L), Protein.6 = c(135L, 154L, 107L, 441L,
159L, 75L, NA, 298L, 104L, 52L, NA, 160L, NA, 284L, 52L,
NA, 187L, 70L, 221L)), .Names = c("X", "Protein.1", "Protein.2",
"Protein.3", "Protein.4", "Protein.5", "Protein.6"), class =
"data.frame", row.names = c("1", "2", "3", "4", "5", "6", "7", "8",
"9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19"))

Related

Rowwise proportion test and add p value as new column

My data:
c5 =structure(list(comorbid = c("heart", "ihd", "cabg", "angio",
"cerebrovasc", "diabetes", "pvd", "amputation", "liver", "malig",
"smoke", "ulcers"), AVF_Y = c(626L, 355L, 266L, 92L, 320L, 1175L,
199L, 89L, 75L, 450L, 901L, 114L), AVG_Y = c(54L, 14L, 18L, 5L,
21L, 37L, 5L, 7L, 5L, 29L, 33L, 3L), AVF_tot = c(2755L, 1768L,
2770L, 2831L, 2844L, 2877L, 1745L, 2823L, 2831L, 2823L, 2798L,
2829L), AVG_tot = c(161L, 61L, 161L, 165L, 166L, 167L, 61L, 165L,
165L, 165L, 159L, 164L)), row.names = c(NA, -12L), class = "data.frame")
I want to perform a prop.test for each row ( a two-proportions z-test) and add the p value as a new column.
I've tried using the following code, but this gives me 24 1-sample proportions test results instead of 12 2-sample test for equality of proportions.
# c() joins the AVF and AVG columns end to end, so Map() hands prop.test one
# count and one total at a time (a one-sample test per element).
Map(prop.test, x = c(c5$AVF_Y, c5$AVG_Y), n = c(c5$AVF_tot, c5$AVG_tot))
Use a lambda function and extract the p-value. Concatenating whole columns returns a single vector whose length is twice the number of rows of the data. Instead, we need to concatenate within the loop, so that for each row we build a length-2 vector for x and a length-2 vector for n from the corresponding '_Y' and '_tot' columns.
# Row-wise two-sample proportion test: for each row of c5, compare
# AVF_Y / AVF_tot with AVG_Y / AVG_tot and keep only the p-value.
with(
  c5,
  mapply(
    function(x_avf, x_avg, n_avf, n_avg) {
      prop.test(c(x_avf, x_avg), c(n_avf, n_avg))$p.value
    },
    AVF_Y, AVG_Y, AVF_tot, AVG_tot
  )
)
-output
[1] 2.218376e-03 6.985883e-01 6.026012e-01 1.000000e+00 6.695440e-01 2.425781e-06 5.672322e-01 5.861097e-01 9.627050e-01 6.546286e-01 3.360300e-03 2.276857e-01
Or use do.call with Map or mapply
# Same test, passing the four count columns of c5 (everything except the
# first column) to mapply() as unnamed positional arguments.
do.call(
  mapply,
  c(
    list(FUN = function(x_avf, x_avg, n_avf, n_avg) {
      prop.test(c(x_avf, x_avg), c(n_avf, n_avg))$p.value
    }),
    unname(as.list(c5[-1]))
  )
)
[1] 2.218376e-03 6.985883e-01 6.026012e-01 1.000000e+00 6.695440e-01 2.425781e-06 5.672322e-01 5.861097e-01 9.627050e-01 6.546286e-01 3.360300e-03 2.276857e-01
Or with apply
# Turn the count columns into a matrix, then test each row: the successes
# are AVF_Y / AVG_Y and the trials are AVF_tot / AVG_tot.
apply(
  as.matrix(c5[-1]),
  1,
  function(counts) {
    prop.test(
      x = counts[c("AVF_Y", "AVG_Y")],
      n = counts[c("AVF_tot", "AVG_tot")]
    )$p.value
  }
)
[1] 2.218376e-03 6.985883e-01 6.026012e-01 1.000000e+00 6.695440e-01 2.425781e-06 5.672322e-01 5.861097e-01 9.627050e-01 6.546286e-01 3.360300e-03 2.276857e-01
Or use rowwise
library(dplyr)
# Treat every row as its own group so prop.test() sees one pair of counts
# at a time, then drop the row-wise grouping again.
c5 %>%
  rowwise() %>%
  mutate(
    pval = prop.test(
      x = c(AVF_Y, AVG_Y),
      n = c(AVF_tot, AVG_tot)
    )$p.value
  ) %>%
  ungroup()
-output
# A tibble: 12 × 6
comorbid AVF_Y AVG_Y AVF_tot AVG_tot pval
<chr> <int> <int> <int> <int> <dbl>
1 heart 626 54 2755 161 0.00222
2 ihd 355 14 1768 61 0.699
3 cabg 266 18 2770 161 0.603
4 angio 92 5 2831 165 1.00
5 cerebrovasc 320 21 2844 166 0.670
6 diabetes 1175 37 2877 167 0.00000243
7 pvd 199 5 1745 61 0.567
8 amputation 89 7 2823 165 0.586
9 liver 75 5 2831 165 0.963
10 malig 450 29 2823 165 0.655
11 smoke 901 33 2798 159 0.00336
12 ulcers 114 3 2829 164 0.228

Sorting a column in a data frame in R

*I wanted to arrange the column "TotalConfirmedCases" in descending order but it sorted in a weird way like 965 is arranged first.
CODE in R: new_Cor_table[rev(order(new_Cor_table$TotalConfirmedCases)),]
Output:
Update: thanks to input from @onyambu:
We could use order with decreasing=TRUE:
# Sort by confirmed cases, largest first. The counts are converted to numeric
# (after stripping any thousands separators) so the sort stays numeric even
# if the column was read in as text such as "9,534"; a character sort would
# place "965" ahead of "95277".
newdata <- df[
  order(
    as.numeric(gsub(",", "", df$TotalConfirmedCases, fixed = TRUE)),
    decreasing = TRUE
  ),
]
OR
If we want to do it with rev then here is the syntax:
# rev() flips the ascending order, so tied values come out in reverse row
# order (row 5 is printed before row 4 for the two counts of 8892).
newdata <- df[rev(order(df$TotalConfirmedCases)),]
newdata
County TotalConfirmedCases Totalprobablecases Totalcases Totaldeaths
3 Dakota 95277 23,252 118,529 792
7 Anoka 83623 20,459 104,082 808
26 Washington 57910 14,193 72,103 490
30 Stearns 50672 2,622 53,294 372
34 Olmsted 44718 1,048 45,766 191
36 St. Louis 43103 8,153 51,256 541
2 Douglas 9534 1,962 11,496 118
5 Isanti 8892 1,645 10,537 119
4 Morrison 8892 616 9,508 105
6 Freeborn 8753 679 9,432 77
8 Nicollet 8244 385 8,629 66
9 Becker 7877 1,292 9,169 95
11 Polk 7319 1,852 9,171 109
12 Carlton 7203 2,451 9,654 100
13 Mille Lacs 6962 578 7,540 116
15 Cass 6687 668 7,355 83
16 Todd 6605 486 7,091 61
17 Lyon 6503 759 7,262 74
18 Brown 6460 330 6,790 81
19 Le Sueur 6294 449 6,743 51
21 Pine 6141 1,319 7,460 68
22 Nobles 6025 1,044 7,069 60
23 Dodge 5916 144 6,060 22
24 Meeker 5803 361 6,164 75
25 Wabasha 5795 172 5,967 19
28 Waseca 5314 424 5,738 39
29 Martin 5273 549 5,822 65
31 Fillmore 4953 117 5,070 24
32 Hubbard 4579 556 5,135 60
33 Houston 4498 320 4,818 20
35 Roseau 4327 281 4,608 45
37 Faribault 3759 213 3,972 54
38 Redwood 3661 417 4,078 54
39 Wadena 3636 754 4,390 56
1 Kittson 965 109 1,074 28
10 Lake\tof the Woods 771 34 805 6
14 Red Lake 692 269 961 13
20 Cook 620 12 632 4
27 Traverse 577 313 890 10
>
data:
structure(list(County = c("Kittson", "Douglas", "Dakota", "Morrison",
"Isanti", "Freeborn", "Anoka", "Nicollet", "Becker", "Lake\tof the Woods",
"Polk", "Carlton", "Mille Lacs", "Red Lake", "Cass", "Todd",
"Lyon", "Brown", "Le Sueur", "Cook", "Pine", "Nobles", "Dodge",
"Meeker", "Wabasha", "Washington", "Traverse", "Waseca", "Martin",
"Stearns", "Fillmore", "Hubbard", "Houston", "Olmsted", "Roseau",
"St. Louis", "Faribault", "Redwood", "Wadena"), TotalConfirmedCases = c(965L,
9534L, 95277L, 8892L, 8892L, 8753L, 83623L, 8244L, 7877L, 771L,
7319L, 7203L, 6962L, 692L, 6687L, 6605L, 6503L, 6460L, 6294L,
620L, 6141L, 6025L, 5916L, 5803L, 5795L, 57910L, 577L, 5314L,
5273L, 50672L, 4953L, 4579L, 4498L, 44718L, 4327L, 43103L, 3759L,
3661L, 3636L), Totalprobablecases = c("109", "1,962", "23,252",
"616", "1,645", "679", "20,459", "385", "1,292", "34", "1,852",
"2,451", "578", "269", "668", "486", "759", "330", "449", "12",
"1,319", "1,044", "144", "361", "172", "14,193", "313", "424",
"549", "2,622", "117", "556", "320", "1,048", "281", "8,153",
"213", "417", "754"), Totalcases = c("1,074", "11,496", "118,529",
"9,508", "10,537", "9,432", "104,082", "8,629", "9,169", "805",
"9,171", "9,654", "7,540", "961", "7,355", "7,091", "7,262",
"6,790", "6,743", "632", "7,460", "7,069", "6,060", "6,164",
"5,967", "72,103", "890", "5,738", "5,822", "53,294", "5,070",
"5,135", "4,818", "45,766", "4,608", "51,256", "3,972", "4,078",
"4,390"), Totaldeaths = c(28L, 118L, 792L, 105L, 119L, 77L, 808L,
66L, 95L, 6L, 109L, 100L, 116L, 13L, 83L, 61L, 74L, 81L, 51L,
4L, 68L, 60L, 22L, 75L, 19L, 490L, 10L, 39L, 65L, 372L, 24L,
60L, 20L, 191L, 45L, 541L, 54L, 54L, 56L)), class = "data.frame", row.names = c(NA,
-39L))
I suggest using the rank function, with a negative sign it will reverse the order
# Negating the ranks makes order() list the largest count first.
new_Cor_table[order(-rank(new_Cor_table$TotalConfirmedCases)), ]

Filter a dataframe based on a shorter dataframe

I have the following dataframes I've simplified for practical purposes:
head(coords_int)
seqnames start end
1 chr2 181529780 181533313
2 chr2 98396674 98396940
3 chr5 64919375 64919395
4 chr9 2795948 2797647
5 chr7 138873527 138873574
6 chr4 154736072 154736138
7 chr6 10762723 10769212
8 chr10 93614727 93614773
9 chr17 76539181 76539300
10 chr6 99608741 99608872
11 chr10 47330654 47330828
12 chr10 47331176 47331237
13 chr10 93612154 93612575
14 chr10 84248910 84249043
15 chr17 28547999 28548101
16 chr17 28548592 28548705
17 chr11 46701067 46701141
18 chr16 49847678 49847918
19 chr16 49822670 49822738
head(pdoms_protein)
tx_id seqnames start end width strand exon_id exon_rank cds_ok
1 ENST00000339098 2 181573753 181573876 124 - ENSE00003634697 3 TRUE
2 ENST00000339098 2 181573753 181573876 124 - ENSE00003634697 3 TRUE
3 ENST00000339098 2 181566058 181566121 64 - ENSE00003523731 4 TRUE
4 ENST00000393504 2 98395844 98396397 554 + ENSE00000963920 8 TRUE
5 ENST00000393504 2 98395844 98396397 554 + ENSE00000963920 8 TRUE
6 ENST00000393504 2 98396674 98396940 267 + ENSE00000963920 8 TRUE
7 ENST00000381070 5 64774694 64774787 94 + ENSE00003522928 2 TRUE
8 ENST00000381070 5 64774694 64774787 94 + ENSE00003522928 2 TRUE
9 ENST00000381070 5 64774694 64774787 94 + ENSE00003522928 2 TRUE
10 ENST00000381070 5 64781921 64782033 113 + ENSE00003582136 3 TRUE
11 ENST00000381070 5 64781921 64782033 113 + ENSE00003582136 3 TRUE
12 ENST00000382082 9 2718229 2718276 48 + ENSE00001490869 1 TRUE
13 ENST00000382082 9 2718229 2718276 48 + ENSE00001490869 1 TRUE
14 ENST00000422774 7 138881388 138881584 197 - ENSE00001088065 11 TRUE
15 ENST00000422774 7 138879538 138879653 116 - ENSE00001088074 12 TRUE
16 ENST00000422774 7 138871157 138871362 206 - ENSE00001088067 13 TRUE
17 ENST00000336356 4 154744456 154744845 390 + ENSE00001344788 2 TRUE
18 ENST00000502525 4 154744456 154744530 75 + ENSE00002048458 4 FALSE
19 ENST00000507827 4 154744456 154744845 390 + ENSE00001344788 2 TRUE
20 ENST00000313243 6 10830548 10830639 92 - ENSE00003696993 2 TRUE
21 ENST00000313243 6 10830548 10830639 92 - ENSE00003696993 2 TRUE
22 ENST00000313243 6 10830548 10830639 92 - ENSE00003696993 2 TRUE
23 ENST00000313243 6 10830548 10830639 92 - ENSE00003696993 2 TRUE
protein_start protein_end protein_domain_id protein_domain_source interpro_accession
1 164 339 PS50146 pfscan IPR001206
2 164 339 PF00781 pfam IPR001206
3 164 339 PS50146 pfscan IPR001206
4 171 409 PF16526 pfam IPR032406
5 171 409 SM00100 smart IPR000595
6 502 590 PS50042 pfscan IPR000595
7 16 166 PR00153 prints IPR002130
8 16 166 PR00153 prints IPR002130
9 16 166 PR00153 prints IPR002130
10 16 166 PS50072 pfscan IPR002130
11 16 166 PS00170 scanprosite IPR020892
12 164 179 PR01494 prints IPR003971
13 164 179 PR01491 prints IPR003968
14 1039 1702 PF12877 pfam IPR024606
15 1039 1702 PF12877 pfam IPR024606
16 1039 1702 PF12877 pfam IPR024606
17 44 173 PF04970 pfam IPR007053
18 44 68 PF04970 pfam IPR007053
19 44 173 PF04970 pfam IPR007053
20 4 284 PS50011 pfscan IPR000719
21 4 284 PS00107 scanprosite IPR017441
22 4 284 PS00108 scanprosite IPR008271
23 4 284 SSF56112 superfamily IPR011009
prot_dom_start prot_dom_end gene_name
1 164 339 CERKL
2 170 334 CERKL
3 164 339 CERKL
4 598 668 CNGA3
5 482 606 CNGA3
6 482 596 CNGA3
7 125 140 CWC27
8 97 112 CWC27
9 112 124 CWC27
10 19 166 CWC27
11 49 66 CWC27
12 187 199 KCNV2
13 410 424 KCNV2
14 1039 1702 KIAA1549
15 1039 1702 KIAA1549
16 1039 1702 KIAA1549
17 44 173 LRAT
18 44 68 LRAT
19 44 173 LRAT
20 4 284 MAK
21 10 33 MAK
22 121 133 MAK
23 1 285 MAK
I would like to know if any of the coords_int$start are part of the pdoms_protein$start / pdoms_protein$end range and the same for the coords_int$end and then filter only the data that falls in this category.
I tried
library(tidyverse)
# Flag the rows whose [start, end] range contains the single position
# 98396674, then keep only the flagged rows.
pdoms_protein %>%
mutate(dom.ok = 98396674>= start & 98396674<= end) %>%
# dom.ok is logical; comparing it with the string "TRUE" relies on coercion.
filter(dom.ok == "TRUE")
And it works but only for one value at a time. Is there a more practical way to do it all at once?
We could do it with fuzzyjoin:
library(fuzzyjoin)
library(dplyr)
library(tidyr) # pivot_longer() is exported by tidyr, not dplyr
# One row per coordinate: `name` is "start" or "end", `value` is the position.
long_coords_int <- coords_int %>%
  pivot_longer(-seqnames)
# Keep every coordinate and attach each domain range that contains it
# (value >= start and value <= end).
# NOTE(review): only positions are compared; seqnames is not part of the
# join -- confirm that ignoring the chromosome is intended.
fuzzy_left_join(
  long_coords_int,
  pdoms_protein[3:4],
  by = c("value" = "start", "value" = "end"),
  match_fun = list(`>=`, `<=`)
) %>%
  # Coordinates without a match have NA in the joined `start` column.
  mutate(found = if_else(is.na(start), NA_character_, "YES"))
seqnames name value start end found
<chr> <chr> <int> <int> <int> <chr>
1 chr2 start 181529780 NA NA NA
2 chr2 end 181533313 NA NA NA
3 chr2 start 98396674 98396674 98396940 YES
4 chr2 end 98396940 98396674 98396940 YES
5 chr5 start 64919375 NA NA NA
6 chr5 end 64919395 NA NA NA
7 chr9 start 2795948 NA NA NA
8 chr9 end 2797647 NA NA NA
9 chr7 start 138873527 NA NA NA
10 chr7 end 138873574 NA NA NA
# ... with 28 more rows
coords_int <- structure(list(seqnames = c("chr2", "chr2", "chr5", "chr9", "chr7",
"chr4", "chr6", "chr10", "chr17", "chr6", "chr10", "chr10", "chr10",
"chr10", "chr17", "chr17", "chr11", "chr16", "chr16"), start = c(181529780L,
98396674L, 64919375L, 2795948L, 138873527L, 154736072L, 10762723L,
93614727L, 76539181L, 99608741L, 47330654L, 47331176L, 93612154L,
84248910L, 28547999L, 28548592L, 46701067L, 49847678L, 49822670L
), end = c(181533313L, 98396940L, 64919395L, 2797647L, 138873574L,
154736138L, 10769212L, 93614773L, 76539300L, 99608872L, 47330828L,
47331237L, 93612575L, 84249043L, 28548101L, 28548705L, 46701141L,
49847918L, 49822738L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19"))
pdoms_protein <- structure(list(tx_id = c("ENST00000339098", "ENST00000339098",
"ENST00000339098", "ENST00000393504", "ENST00000393504", "ENST00000393504",
"ENST00000381070", "ENST00000381070", "ENST00000381070", "ENST00000381070",
"ENST00000381070", "ENST00000382082", "ENST00000382082", "ENST00000422774",
"ENST00000422774", "ENST00000422774", "ENST00000336356", "ENST00000502525",
"ENST00000507827", "ENST00000313243", "ENST00000313243", "ENST00000313243",
"ENST00000313243"), seqnames = c(2L, 2L, 2L, 2L, 2L, 2L, 5L,
5L, 5L, 5L, 5L, 9L, 9L, 7L, 7L, 7L, 4L, 4L, 4L, 6L, 6L, 6L, 6L
), start = c(181573753L, 181573753L, 181566058L, 98395844L, 98395844L,
98396674L, 64774694L, 64774694L, 64774694L, 64781921L, 64781921L,
2718229L, 2718229L, 138881388L, 138879538L, 138871157L, 154744456L,
154744456L, 154744456L, 10830548L, 10830548L, 10830548L, 10830548L
), end = c(181573876L, 181573876L, 181566121L, 98396397L, 98396397L,
98396940L, 64774787L, 64774787L, 64774787L, 64782033L, 64782033L,
2718276L, 2718276L, 138881584L, 138879653L, 138871362L, 154744845L,
154744530L, 154744845L, 10830639L, 10830639L, 10830639L, 10830639L
), width = c(124L, 124L, 64L, 554L, 554L, 267L, 94L, 94L, 94L,
113L, 113L, 48L, 48L, 197L, 116L, 206L, 390L, 75L, 390L, 92L,
92L, 92L, 92L), strand = c("-", "-", "-", "+", "+", "+", "+",
"+", "+", "+", "+", "+", "+", "-", "-", "-", "+", "+", "+", "-",
"-", "-", "-"), exon_id = c("ENSE00003634697", "ENSE00003634697",
"ENSE00003523731", "ENSE00000963920", "ENSE00000963920", "ENSE00000963920",
"ENSE00003522928", "ENSE00003522928", "ENSE00003522928", "ENSE00003582136",
"ENSE00003582136", "ENSE00001490869", "ENSE00001490869", "ENSE00001088065",
"ENSE00001088074", "ENSE00001088067", "ENSE00001344788", "ENSE00002048458",
"ENSE00001344788", "ENSE00003696993", "ENSE00003696993", "ENSE00003696993",
"ENSE00003696993"), exon_rank = c(3L, 3L, 4L, 8L, 8L, 8L, 2L,
2L, 2L, 3L, 3L, 1L, 1L, 11L, 12L, 13L, 2L, 4L, 2L, 2L, 2L, 2L,
2L), cds_ok = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE,
TRUE, TRUE, TRUE, TRUE, TRUE), protein_start = c(164L, 164L,
164L, 171L, 171L, 502L, 16L, 16L, 16L, 16L, 16L, 164L, 164L,
1039L, 1039L, 1039L, 44L, 44L, 44L, 4L, 4L, 4L, 4L), protein_end = c(339L,
339L, 339L, 409L, 409L, 590L, 166L, 166L, 166L, 166L, 166L, 179L,
179L, 1702L, 1702L, 1702L, 173L, 68L, 173L, 284L, 284L, 284L,
284L), protein_domain_id = c("PS50146", "PF00781", "PS50146",
"PF16526", "SM00100", "PS50042", "PR00153", "PR00153", "PR00153",
"PS50072", "PS00170", "PR01494", "PR01491", "PF12877", "PF12877",
"PF12877", "PF04970", "PF04970", "PF04970", "PS50011", "PS00107",
"PS00108", "SSF56112"), protein_domain_source = c("pfscan", "pfam",
"pfscan", "pfam", "smart", "pfscan", "prints", "prints", "prints",
"pfscan", "scanprosite", "prints", "prints", "pfam", "pfam",
"pfam", "pfam", "pfam", "pfam", "pfscan", "scanprosite", "scanprosite",
"superfamily"), interpro_accession = c("IPR001206", "IPR001206",
"IPR001206", "IPR032406", "IPR000595", "IPR000595", "IPR002130",
"IPR002130", "IPR002130", "IPR002130", "IPR020892", "IPR003971",
"IPR003968", "IPR024606", "IPR024606", "IPR024606", "IPR007053",
"IPR007053", "IPR007053", "IPR000719", "IPR017441", "IPR008271",
"IPR011009"), prot_dom_start = c(164L, 170L, 164L, 598L, 482L,
482L, 125L, 97L, 112L, 19L, 49L, 187L, 410L, 1039L, 1039L, 1039L,
44L, 44L, 44L, 4L, 10L, 121L, 1L), prot_dom_end = c(339L, 334L,
339L, 668L, 606L, 596L, 140L, 112L, 124L, 166L, 66L, 199L, 424L,
1702L, 1702L, 1702L, 173L, 68L, 173L, 284L, 33L, 133L, 285L),
gene_name = c("CERKL", "CERKL", "CERKL", "CNGA3", "CNGA3",
"CNGA3", "CWC27", "CWC27", "CWC27", "CWC27", "CWC27", "KCNV2",
"KCNV2", "KIAA1549", "KIAA1549", "KIAA1549", "LRAT", "LRAT",
"LRAT", "MAK", "MAK", "MAK", "MAK")), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23"))
You could use data.table::foverlaps(), like this:
library(data.table)
# Convert both tables to data.tables in place.
setDT(coords_int)
setDT(pdoms_protein)
# Key the coordinate table on its interval columns for the overlap join.
setkey(coords_int,start,end)
# Overlap join of the protein-domain intervals against the coordinate
# intervals.
# NOTE(review): the key holds only start/end, so seqnames is not compared --
# confirm that ignoring the chromosome is intended.
foverlaps(pdoms_protein,coords_int)
Also see package IRanges
Not tested, but I think you could make a function like this
# Flag every protein-domain row with isok() and keep the flagged rows.
# isok() has to be defined before this pipeline is run.
pdoms_protein %>%
  mutate(dom.ok = isok(start, end)) %>%
  # dom.ok is logical, so it can serve as the filter condition directly.
  filter(dom.ok)
# Report, for each interval [local_start[i], local_end[i]], whether at least
# one interval of `coords` lies entirely inside it.
#
# local_start, local_end: vectors of equal length holding the interval bounds.
# coords: table with `start` and `end` columns; defaults to the `coords_int`
#   table defined elsewhere in the script.
# Returns a logical vector with one element per input interval, so the
# function can be applied to whole columns (e.g. inside mutate()).
isok <- function(local_start, local_end, coords = coords_int) {
  stopifnot(length(local_start) == length(local_end))
  vapply(
    seq_along(local_start),
    function(i) {
      inside <- coords$start >= local_start[i] & coords$end <= local_end[i]
      # An NA comparison counts as "no match" rather than propagating NA.
      any(inside, na.rm = TRUE)
    },
    logical(1)
  )
}

Calculate average based on columns in 2 dataframes and their values via mutate in R?

I have a dataframe structure that calculates the sum of Response.Status found per month with this mutate function:
# Build a status-by-month table of row counts.
DF1 <- complete_df %>%
# Month: `date` parsed as "YYYY/mm/dd" and re-formatted as "mm/YYYY".
# UNSUBSCRIBE: the label "UNSUBSCRIBE" where the column equals "TRUE", else NA.
mutate(Month = format(as.Date(date, format = "%Y/%m/%d"), "%m/%Y"),
UNSUBSCRIBE = if_else(UNSUBSCRIBE == "TRUE", "UNSUBSCRIBE", NA_character_)) %>%
# Stack both status columns into a single Response.Status column.
pivot_longer(c(Response.Status, UNSUBSCRIBE), values_to = "Response.Status") %>%
# Drop the rows that contain an NA.
drop_na() %>%
# Count rows per month and status, then spread the months into columns.
count(Month, Response.Status) %>%
pivot_wider(names_from = Month, names_sep = "/", values_from = n)
# A tibble: 7 x 16
Response.Status `01/2020` `02/2020` `03/2020` `04/2020` `05/2020` `06/2020` `07/2020` `08/2020` `09/2019` `09/2020` `10/2019` `10/2020` `11/2019` `11/2020` `12/2019`
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 EMAIL_OPENED 1068 3105 4063 4976 2079 1856 4249 3638 882 4140 865 2573 1167 684 862
2 NOT_RESPONDED 3187 9715 13164 15239 5458 4773 12679 10709 2798 15066 2814 8068 3641 1931 2647
3 PARTIALLY_SAVED 5 34 56 8 28 22 73 86 11 14 7 23 8 8 2
4 SUBMITTED 216 557 838 828 357 310 654 621 214 1001 233 497 264 122 194
5 SURVEY_OPENED 164 395 597 1016 245 212 513 625 110 588 123 349 202 94 120
6 UNDELIVERED_OR_BOUNCED 92 280 318 260 109 127 319 321 63 445 69 192 93 39 74
7 UNSUBSCRIBE 397 1011 1472 1568 727 737 1745 2189 372 1451 378 941 429 254 355
What I would like to do is take those values created in table to calculate average based on # of people in each Response.Status group.
structure(list(Response.Status = c("EMAIL_OPENED", "NOT_RESPONDED",
"PARTIALLY_SAVED", "SUBMITTED", "SURVEY_OPENED", "UNDELIVERED_OR_BOUNCED"
), `01/2020` = c(1068L, 3187L, 5L, 216L, 164L, 92L), `02/2020` = c(3105L,
9715L, 34L, 557L, 395L, 280L), `03/2020` = c(4063L, 13164L, 56L,
838L, 597L, 318L), `04/2020` = c(4976L, 15239L, 8L, 828L, 1016L,
260L), `05/2020` = c(2079L, 5458L, 28L, 357L, 245L, 109L), `06/2020` = c(1856L,
4773L, 22L, 310L, 212L, 127L), `07/2020` = c(4249L, 12679L, 73L,
654L, 513L, 319L), `08/2020` = c(3638L, 10709L, 86L, 621L, 625L,
321L), `09/2019` = c(882L, 2798L, 11L, 214L, 110L, 63L), `09/2020` = c(4140L,
15066L, 14L, 1001L, 588L, 445L), `10/2019` = c(865L, 2814L, 7L,
233L, 123L, 69L), `10/2020` = c(2573L, 8068L, 23L, 497L, 349L,
192L), `11/2019` = c(1167L, 3641L, 8L, 264L, 202L, 93L), `11/2020` = c(684L,
1931L, 8L, 122L, 94L, 39L), `12/2019` = c(862L, 2647L, 2L, 194L,
120L, 74L)), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
I made a separate table that contains sum values based on those group names:
Response.Status
EMAIL_OPENED : 451
NOT_RESPONDED : 1563
PARTIALLY_SAVED : 4
SUBMITTED : 71
SURVEY_OPENED : 53
UNDELIVERED_OR_BOUNCED: 47
UNSUBSCRIBE: 135
If I understood your problem correctly, you have two data frames/tibbles: the one shown in the "structure" part and one that gives the number of people/users per response status. Now you want to get the value per person. If so, this is a possible solution:
# People/users per response status.
df2 <- data.frame(
  Response.Status = c(
    "EMAIL_OPENED", "NOT_RESPONDED", "PARTIALLY_SAVED", "SUBMITTED",
    "SURVEY_OPENED", "UNDELIVERED_OR_BOUNCED", "UNSUBSCRIBE"
  ),
  PEOPLE = c(451, 1563, 4, 71, 53, 47, 135)
)
df %>% # this is your "structure"
  # One row per status and month.
  tidyr::pivot_longer(-Response.Status, names_to = "DATE", values_to = "nmbr") %>%
  dplyr::group_by(Response.Status) %>%
  # Total count per status across all months.
  dplyr::summarise(SUM = sum(nmbr)) %>%
  # Name the join key explicitly instead of letting dplyr guess the common
  # columns; statuses missing from either table are dropped.
  dplyr::inner_join(df2, by = "Response.Status") %>%
  dplyr::mutate(MEAN_PP = SUM / PEOPLE)
Response.Status SUM PEOPLE MEAN_PP
<chr> <int> <dbl> <dbl>
1 EMAIL_OPENED 36207 451 80.3
2 NOT_RESPONDED 111889 1563 71.6
3 PARTIALLY_SAVED 385 4 96.2
4 SUBMITTED 6906 71 97.3
5 SURVEY_OPENED 5353 53 101
6 UNDELIVERED_OR_BOUNCED 2801 47 59.6

Vectorizing a for-loop that eliminates duplicate data in dataframe R

I am working with a difficult data manipulation question in R. I am currently using a for-loop to approach the problem, however I would like to vectorize this to have it scale better. I have the following dataframe to work with:
dput(mydf)
structure(list(team_id = c(14L, 14L, 7L, 7L, 21L, 21L, 15L, 15L
), opp_team_id = c(7L, 7L, 14L, 14L, 15L, 15L, 21L, 21L), pg = c(3211L,
3211L, 786L, 786L, 3914L, 644L, 1524L, 593L), sg = c(653L, 4122L,
1512L, 1512L, 2593L, 10L, 54L, 54L), sf = c(4122L, 1742L, 2347L,
2347L, 1352L, 3378L, 2843L, 1062L), pf = c(1742L, 886L, 79L,
1134L, 687L, 1352L, 1376L, 1376L), c = c(3014L, 2604L, 2960L,
2960L, 21L, 3216L, 1256L, 3017L), opp_pg = c(3982L, 3982L, 3211L,
4005L, 1524L, 1524L, 3914L, 644L), opp_sg = c(786L, 2347L, 653L,
653L, 54L, 802L, 2593L, 10L), opp_sf = c(1134L, 1134L, 4122L,
1742L, 1062L, 1062L, 3105L, 3105L), opp_pf = c(183L, 183L, 1742L,
886L, 3017L, 1376L, 3216L, 2135L), opp_c = c(2475L, 2960L, 3138L,
3138L, 1256L, 3017L, 21L, 1957L)), .Names = c("team_id", "opp_team_id",
"pg", "sg", "sf", "pf", "c", "opp_pg", "opp_sg", "opp_sf", "opp_pf",
"opp_c"), row.names = c(NA, -8L), class = "data.frame")
mydf
team_id opp_team_id pg sg sf pf c opp_pg opp_sg opp_sf opp_pf opp_c
1 14 7 3211 653 4122 1742 3014 3982 786 1134 183 2475
2 14 7 3211 4122 1742 886 2604 3982 2347 1134 183 2960
3 7 14 786 1512 2347 79 2960 3211 653 4122 1742 3138
4 7 14 786 1512 2347 1134 2960 4005 653 1742 886 3138
5 21 15 3914 2593 1352 687 21 1524 54 1062 3017 1256
6 21 15 644 10 3378 1352 3216 1524 802 1062 1376 3017
7 15 21 1524 54 2843 1376 1256 3914 2593 3105 3216 21
8 15 21 593 54 1062 1376 3017 644 10 3105 2135 1957
Based on my problem at hand, rows 3-4 and 7-8 are duplicates in this dataframe. Rows 3-4 are duplicates of rows 1-2, and rows 7-8 are duplicates of rows 5-6. This is sports data, and rows 3-4 are essentially rows 1 and 2 except with the team_id and opp_team_id switched, and the same for the other 10 columns (for the most part).
Here is my for-loop for removing duplicates, which I think is quite creative, but is a for-loop nonetheless:
# Walk down the rows and flip TFSwitch every time team_id changes from one
# row to the next; rows visited while the switch is TRUE are kept.
n_rows <- nrow(mydf)
keep <- logical(n_rows)
TFSwitch <- TRUE
# seq_len() is safe for 0- and 1-row inputs, where 2:nrow(mydf) would count
# down (2, 1) and index past the end of the data.
for (i in seq_len(n_rows)) {
  if (i > 1 && mydf$team_id[i - 1] != mydf$team_id[i]) {
    TFSwitch <- !TFSwitch
  }
  keep[i] <- TFSwitch
}
# Row numbers to keep, in ascending order.
indices <- which(keep)
This for-loop goes back and forth checking if the teamID column changes from row to row, and if it does, it toggles TFSwitch from TRUE to FALSE, or vice versa. It then saves the indices I want to keep in a vector.
I would like to vectorize this - any thoughts would be greatly appreciated!
This is very similar to previous problems involving pairwise duplicate removal like: (pair-wise duplicate removal from dataframe). So following a similar procedure, and adding a little merge() back to get the indices, you can do:
vars <- c("team_id", "opp_team_id")
# Order-independent key for each matchup: the larger and the smaller team id.
mx <- pmax(mydf[[vars[1]]], mydf[[vars[2]]])
mn <- pmin(mydf[[vars[1]]], mydf[[vars[2]]])
# TRUE on the first row seen for each matchup.
first_of_pair <- !duplicated(data.frame(mx, mn))
# Tag every row with its row number, then keep the rows whose id pair equals
# that of a first-seen matchup and report their row numbers.
indexed <- cbind(mydf[vars], ind = seq_len(nrow(mydf)))
merge(indexed, mydf[first_of_pair, vars])[, "ind"]
# [1] 1 2 5 6
Here is the same solution using data.table. My understanding is that you want to remove duplicates by pairs, not just find unique indices.
library(data.table)
# Convert mydf to a data.table by reference.
setDT(mydf)
# id1 / id2: the larger and the smaller of the two team ids, i.e. a key for
# the matchup that does not depend on which side is listed first.
mydf[,c("id1","id2"):=list(pmax(team_id,opp_team_id),pmin(team_id,opp_team_id))]
# Key mydf on (team_id, opp_team_id) and join it against one row per
# (id1, id2) matchup.
# NOTE(review): setkey() sorts mydf by reference; whether unique() sees the
# original or the sorted row order depends on evaluation order -- confirm
# which rows end up being retained.
setkey(mydf,team_id,opp_team_id)[unique(mydf,by=c("id1","id2"))]

Resources