apply/create formula to manipulate two dataframes - r

I have two dataframes in R
df1
chr start end strand bam1 bam2 bam3 bam4 bam5 bam6 bam7 bam8
1 chr1 3531569 3531966 - 2 2 1 4 8 36 21 1
2 chr1 3670538 3672624 - 251 50 170 165 294 259 665 86
3 chr1 4491645 4493854 - 220 46 179 167 275 332 414 77
4 chr1 4496542 4497750 - 115 41 100 67 114 69 42 63
5 chr1 4571267 4572265 - 64 32 77 44 76 130 179 27
6 chr1 4688213 4688719 - 39 10 20 20 14 23 25 17
7 chr1 4688800 4688919 - 20 30 10 20 14 55 17 20
8 chr1 4688800 4688919 - 2 4 6 8 10 12 14 16
9 chr1 4688800 4688919 - 1 2 3 4 5 6 7 8
and
df2
bam_file r1 r2
1 bam1 2 1
2 bam2 9 3
3 bam3 1 4
4 bam4 1 5
5 bam5 1 1
6 bam6 8 6
7 bam7 3 7
8 bam8 3 2
I want to apply following formula (let say X), So that column of df2 X row of df1
((df2[1,2]-df1[1,5])ˆ2 + (df2[2,2]-df1[1,6])ˆ2 + (df2[3,2]-df1[1,7])ˆ2 + (df2[4,2]-df1[1,8])ˆ2 + (df2[5,2]-df1[1,9])ˆ2 + (df2[6,2]-df1[1,10])ˆ2 + (df2[7,2]-df1[1,11])ˆ2 +
(df2[8,2]-df1[1,12])ˆ2)/(ncol(df1)-4)
So the desired output will be
output
r1 r2
1 152.375 144.75
2 89140.25 88467.875
3 57822.75 57413.125
4 6195.125 6148
5 8007.375 7858.75
6 395.75 372.625
7 508.75 543.125
8 60.75 47.125
9 15.5 6.875
I apologize if this appears to be a repetitive question, but I tried and was unable to resolve it (as I am beginner and learning). It would be great to find a solution. Thank you in advance and looking for a positive response.

We could create a sequence column ('rn'), reshape to 'long' format with pivot_longer on the first data, join with the second data ('df2') and do a group by calculation on the 'r1', 'r2' columns in reframe
library(dplyr) # version >= 1.1.0
library(tidyr)
df1 %>%
mutate(rn = row_number()) %>%
pivot_longer(cols= starts_with("bam"), names_to = "bam_file") %>%
left_join(df2) %>%
reframe(across(r1:r2, ~ sum((value - .x)^2)/n()), .by = "rn")
-output
# A tibble: 9 × 3
rn r1 r2
<int> <dbl> <dbl>
1 1 152. 145.
2 2 89140. 88468.
3 3 57823. 57413.
4 4 6195. 6148
5 5 8007. 7859.
6 6 396. 373.
7 7 509. 543.
8 8 60.8 47.1
9 9 15.5 6.88
data
df1 <- structure(list(chr = c("chr1", "chr1", "chr1", "chr1", "chr1",
"chr1", "chr1", "chr1", "chr1"), start = c(3531569L, 3670538L,
4491645L, 4496542L, 4571267L, 4688213L, 4688800L, 4688800L, 4688800L
), end = c(3531966L, 3672624L, 4493854L, 4497750L, 4572265L,
4688719L, 4688919L, 4688919L, 4688919L), strand = c("-", "-",
"-", "-", "-", "-", "-", "-", "-"), bam1 = c(2L, 251L, 220L,
115L, 64L, 39L, 20L, 2L, 1L), bam2 = c(2L, 50L, 46L, 41L, 32L,
10L, 30L, 4L, 2L), bam3 = c(1L, 170L, 179L, 100L, 77L, 20L, 10L,
6L, 3L), bam4 = c(4L, 165L, 167L, 67L, 44L, 20L, 20L, 8L, 4L),
bam5 = c(8L, 294L, 275L, 114L, 76L, 14L, 14L, 10L, 5L), bam6 = c(36L,
259L, 332L, 69L, 130L, 23L, 55L, 12L, 6L), bam7 = c(21L,
665L, 414L, 42L, 179L, 25L, 17L, 14L, 7L), bam8 = c(1L, 86L,
77L, 63L, 27L, 17L, 20L, 16L, 8L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9"))
df2 <- structure(list(bam_file = c("bam1", "bam2", "bam3", "bam4", "bam5",
"bam6", "bam7", "bam8"), r1 = c(2L, 9L, 1L, 1L, 1L, 8L, 3L, 3L
), r2 = c(1L, 3L, 4L, 5L, 1L, 6L, 7L, 2L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8"))

Related

R dividing all elements on the same row

Hello i have a dataFrame with 15167 rows like this:
> head(datExpr)
F02234_L1_S1_L001 F02235_L1_S2_L001
ENSG00000182199 1546 1710
ENSG00000157870 3491 2951
ENSG00000077092 35 22
ENSG00000047936 49 12
ENSG00000156282 0 0
ENSG00000183856 265 365
F02236_L1_S3_L001 F02237_L1_S4_L001
ENSG00000182199 3530 3582
ENSG00000157870 2372 2360
ENSG00000077092 111 34
ENSG00000047936 43 6
ENSG00000156282 0 0
ENSG00000183856 768 1489
and a list with 15167 elements like this:
> distances
[[1]]
end_position
1 12883
2 100722
3 119629
4 57174
5 78317
6 453372
7 21772
8 30102
9 116734
..........
I want to divide all in the dataframe by equivalent values of my list. For example 1546/12883,1710/12883, 3530/12883 ... and 3491/100722, 2951/100722...
How can i do that with R?
We could directly divide the list element after extracting as it gets recycled
datExpr/distances[[1]]$end_position
-output
F02234_L1_S1_L001 F02235_L1_S2_L001 F02236_L1_S3_L001 F02237_L1_S4_L001
ENSG00000182199 0.1200031049 0.1327330591 0.2740045021 0.2780408290
ENSG00000157870 0.0346597566 0.0292984651 0.0235499692 0.0234308294
ENSG00000077092 0.0002925712 0.0001839019 0.0009278687 0.0002842120
ENSG00000047936 0.0008570329 0.0002098856 0.0007520901 0.0001049428
ENSG00000156282 0.0000000000 0.0000000000 0.0000000000 0.0000000000
ENSG00000183856 0.0005845090 0.0008050784 0.0016939732 0.0032842787
data
datExpr <- structure(list(F02234_L1_S1_L001 = c(1546L, 3491L, 35L, 49L,
0L, 265L), F02235_L1_S2_L001 = c(1710L, 2951L, 22L, 12L, 0L,
365L), F02236_L1_S3_L001 = c(3530L, 2372L, 111L, 43L, 0L, 768L
), F02237_L1_S4_L001 = c(3582L, 2360L, 34L, 6L, 0L, 1489L)), class = "data.frame", row.names = c("ENSG00000182199",
"ENSG00000157870", "ENSG00000077092", "ENSG00000047936", "ENSG00000156282",
"ENSG00000183856"))
distances <- list(structure(list(end_position = c(12883L, 100722L, 119629L,
57174L, 78317L, 453372L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6")))

Filter a dataframe based on a shorter dataframe

I have the following dataframes I've simplified for practical purposes:
head(coords_int)
seqnames start end
1 chr2 181529780 181533313
2 chr2 98396674 98396940
3 chr5 64919375 64919395
4 chr9 2795948 2797647
5 chr7 138873527 138873574
6 chr4 154736072 154736138
7 chr6 10762723 10769212
8 chr10 93614727 93614773
9 chr17 76539181 76539300
10 chr6 99608741 99608872
11 chr10 47330654 47330828
12 chr10 47331176 47331237
13 chr10 93612154 93612575
14 chr10 84248910 84249043
15 chr17 28547999 28548101
16 chr17 28548592 28548705
17 chr11 46701067 46701141
18 chr16 49847678 49847918
19 chr16 49822670 49822738
head(pdoms_protein)
tx_id seqnames start end width strand exon_id exon_rank cds_ok
1 ENST00000339098 2 181573753 181573876 124 - ENSE00003634697 3 TRUE
2 ENST00000339098 2 181573753 181573876 124 - ENSE00003634697 3 TRUE
3 ENST00000339098 2 181566058 181566121 64 - ENSE00003523731 4 TRUE
4 ENST00000393504 2 98395844 98396397 554 + ENSE00000963920 8 TRUE
5 ENST00000393504 2 98395844 98396397 554 + ENSE00000963920 8 TRUE
6 ENST00000393504 2 98396674 98396940 267 + ENSE00000963920 8 TRUE
7 ENST00000381070 5 64774694 64774787 94 + ENSE00003522928 2 TRUE
8 ENST00000381070 5 64774694 64774787 94 + ENSE00003522928 2 TRUE
9 ENST00000381070 5 64774694 64774787 94 + ENSE00003522928 2 TRUE
10 ENST00000381070 5 64781921 64782033 113 + ENSE00003582136 3 TRUE
11 ENST00000381070 5 64781921 64782033 113 + ENSE00003582136 3 TRUE
12 ENST00000382082 9 2718229 2718276 48 + ENSE00001490869 1 TRUE
13 ENST00000382082 9 2718229 2718276 48 + ENSE00001490869 1 TRUE
14 ENST00000422774 7 138881388 138881584 197 - ENSE00001088065 11 TRUE
15 ENST00000422774 7 138879538 138879653 116 - ENSE00001088074 12 TRUE
16 ENST00000422774 7 138871157 138871362 206 - ENSE00001088067 13 TRUE
17 ENST00000336356 4 154744456 154744845 390 + ENSE00001344788 2 TRUE
18 ENST00000502525 4 154744456 154744530 75 + ENSE00002048458 4 FALSE
19 ENST00000507827 4 154744456 154744845 390 + ENSE00001344788 2 TRUE
20 ENST00000313243 6 10830548 10830639 92 - ENSE00003696993 2 TRUE
21 ENST00000313243 6 10830548 10830639 92 - ENSE00003696993 2 TRUE
22 ENST00000313243 6 10830548 10830639 92 - ENSE00003696993 2 TRUE
23 ENST00000313243 6 10830548 10830639 92 - ENSE00003696993 2 TRUE
protein_start protein_end protein_domain_id protein_domain_source interpro_accession
1 164 339 PS50146 pfscan IPR001206
2 164 339 PF00781 pfam IPR001206
3 164 339 PS50146 pfscan IPR001206
4 171 409 PF16526 pfam IPR032406
5 171 409 SM00100 smart IPR000595
6 502 590 PS50042 pfscan IPR000595
7 16 166 PR00153 prints IPR002130
8 16 166 PR00153 prints IPR002130
9 16 166 PR00153 prints IPR002130
10 16 166 PS50072 pfscan IPR002130
11 16 166 PS00170 scanprosite IPR020892
12 164 179 PR01494 prints IPR003971
13 164 179 PR01491 prints IPR003968
14 1039 1702 PF12877 pfam IPR024606
15 1039 1702 PF12877 pfam IPR024606
16 1039 1702 PF12877 pfam IPR024606
17 44 173 PF04970 pfam IPR007053
18 44 68 PF04970 pfam IPR007053
19 44 173 PF04970 pfam IPR007053
20 4 284 PS50011 pfscan IPR000719
21 4 284 PS00107 scanprosite IPR017441
22 4 284 PS00108 scanprosite IPR008271
23 4 284 SSF56112 superfamily IPR011009
prot_dom_start prot_dom_end gene_name
1 164 339 CERKL
2 170 334 CERKL
3 164 339 CERKL
4 598 668 CNGA3
5 482 606 CNGA3
6 482 596 CNGA3
7 125 140 CWC27
8 97 112 CWC27
9 112 124 CWC27
10 19 166 CWC27
11 49 66 CWC27
12 187 199 KCNV2
13 410 424 KCNV2
14 1039 1702 KIAA1549
15 1039 1702 KIAA1549
16 1039 1702 KIAA1549
17 44 173 LRAT
18 44 68 LRAT
19 44 173 LRAT
20 4 284 MAK
21 10 33 MAK
22 121 133 MAK
23 1 285 MAK
I would like to know if any of the coords_int$start are part of the pdoms_protein$start / pdoms_protein$end range and the same for the coords_int$end and then filter only the data that falls in this category.
I'd tried
library(tidyverse)
pdoms_protein %>%
mutate(dom.ok = 98396674>= start & 98396674<= end) %>%
filter(dom.ok == "TRUE")
And it works but only for one value at a time. Is there a more practical way to do it all at once?
We could do it with fuzzyjoin:
library(fuzzyjoin)
library(dplyr)
long_coords_int <- coords_int %>%
pivot_longer(-seqnames)
fuzzy_left_join(long_coords_int, pdoms_protein[3:4], by = c("value" = "start", "value" = "end"),
match_fun =list(`>=`, `<=`)) %>%
mutate(found = c(NA, "YES")[(!is.na(start)) + 1])
seqnames name value start end found
<chr> <chr> <int> <int> <int> <chr>
1 chr2 start 181529780 NA NA NA
2 chr2 end 181533313 NA NA NA
3 chr2 start 98396674 98396674 98396940 YES
4 chr2 end 98396940 98396674 98396940 YES
5 chr5 start 64919375 NA NA NA
6 chr5 end 64919395 NA NA NA
7 chr9 start 2795948 NA NA NA
8 chr9 end 2797647 NA NA NA
9 chr7 start 138873527 NA NA NA
10 chr7 end 138873574 NA NA NA
# ... with 28 more rows
coords_int <- structure(list(seqnames = c("chr2", "chr2", "chr5", "chr9", "chr7",
"chr4", "chr6", "chr10", "chr17", "chr6", "chr10", "chr10", "chr10",
"chr10", "chr17", "chr17", "chr11", "chr16", "chr16"), start = c(181529780L,
98396674L, 64919375L, 2795948L, 138873527L, 154736072L, 10762723L,
93614727L, 76539181L, 99608741L, 47330654L, 47331176L, 93612154L,
84248910L, 28547999L, 28548592L, 46701067L, 49847678L, 49822670L
), end = c(181533313L, 98396940L, 64919395L, 2797647L, 138873574L,
154736138L, 10769212L, 93614773L, 76539300L, 99608872L, 47330828L,
47331237L, 93612575L, 84249043L, 28548101L, 28548705L, 46701141L,
49847918L, 49822738L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19"))
pdoms_protein <- structure(list(tx_id = c("ENST00000339098", "ENST00000339098",
"ENST00000339098", "ENST00000393504", "ENST00000393504", "ENST00000393504",
"ENST00000381070", "ENST00000381070", "ENST00000381070", "ENST00000381070",
"ENST00000381070", "ENST00000382082", "ENST00000382082", "ENST00000422774",
"ENST00000422774", "ENST00000422774", "ENST00000336356", "ENST00000502525",
"ENST00000507827", "ENST00000313243", "ENST00000313243", "ENST00000313243",
"ENST00000313243"), seqnames = c(2L, 2L, 2L, 2L, 2L, 2L, 5L,
5L, 5L, 5L, 5L, 9L, 9L, 7L, 7L, 7L, 4L, 4L, 4L, 6L, 6L, 6L, 6L
), start = c(181573753L, 181573753L, 181566058L, 98395844L, 98395844L,
98396674L, 64774694L, 64774694L, 64774694L, 64781921L, 64781921L,
2718229L, 2718229L, 138881388L, 138879538L, 138871157L, 154744456L,
154744456L, 154744456L, 10830548L, 10830548L, 10830548L, 10830548L
), end = c(181573876L, 181573876L, 181566121L, 98396397L, 98396397L,
98396940L, 64774787L, 64774787L, 64774787L, 64782033L, 64782033L,
2718276L, 2718276L, 138881584L, 138879653L, 138871362L, 154744845L,
154744530L, 154744845L, 10830639L, 10830639L, 10830639L, 10830639L
), width = c(124L, 124L, 64L, 554L, 554L, 267L, 94L, 94L, 94L,
113L, 113L, 48L, 48L, 197L, 116L, 206L, 390L, 75L, 390L, 92L,
92L, 92L, 92L), strand = c("-", "-", "-", "+", "+", "+", "+",
"+", "+", "+", "+", "+", "+", "-", "-", "-", "+", "+", "+", "-",
"-", "-", "-"), exon_id = c("ENSE00003634697", "ENSE00003634697",
"ENSE00003523731", "ENSE00000963920", "ENSE00000963920", "ENSE00000963920",
"ENSE00003522928", "ENSE00003522928", "ENSE00003522928", "ENSE00003582136",
"ENSE00003582136", "ENSE00001490869", "ENSE00001490869", "ENSE00001088065",
"ENSE00001088074", "ENSE00001088067", "ENSE00001344788", "ENSE00002048458",
"ENSE00001344788", "ENSE00003696993", "ENSE00003696993", "ENSE00003696993",
"ENSE00003696993"), exon_rank = c(3L, 3L, 4L, 8L, 8L, 8L, 2L,
2L, 2L, 3L, 3L, 1L, 1L, 11L, 12L, 13L, 2L, 4L, 2L, 2L, 2L, 2L,
2L), cds_ok = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE,
TRUE, TRUE, TRUE, TRUE, TRUE), protein_start = c(164L, 164L,
164L, 171L, 171L, 502L, 16L, 16L, 16L, 16L, 16L, 164L, 164L,
1039L, 1039L, 1039L, 44L, 44L, 44L, 4L, 4L, 4L, 4L), protein_end = c(339L,
339L, 339L, 409L, 409L, 590L, 166L, 166L, 166L, 166L, 166L, 179L,
179L, 1702L, 1702L, 1702L, 173L, 68L, 173L, 284L, 284L, 284L,
284L), protein_domain_id = c("PS50146", "PF00781", "PS50146",
"PF16526", "SM00100", "PS50042", "PR00153", "PR00153", "PR00153",
"PS50072", "PS00170", "PR01494", "PR01491", "PF12877", "PF12877",
"PF12877", "PF04970", "PF04970", "PF04970", "PS50011", "PS00107",
"PS00108", "SSF56112"), protein_domain_source = c("pfscan", "pfam",
"pfscan", "pfam", "smart", "pfscan", "prints", "prints", "prints",
"pfscan", "scanprosite", "prints", "prints", "pfam", "pfam",
"pfam", "pfam", "pfam", "pfam", "pfscan", "scanprosite", "scanprosite",
"superfamily"), interpro_accession = c("IPR001206", "IPR001206",
"IPR001206", "IPR032406", "IPR000595", "IPR000595", "IPR002130",
"IPR002130", "IPR002130", "IPR002130", "IPR020892", "IPR003971",
"IPR003968", "IPR024606", "IPR024606", "IPR024606", "IPR007053",
"IPR007053", "IPR007053", "IPR000719", "IPR017441", "IPR008271",
"IPR011009"), prot_dom_start = c(164L, 170L, 164L, 598L, 482L,
482L, 125L, 97L, 112L, 19L, 49L, 187L, 410L, 1039L, 1039L, 1039L,
44L, 44L, 44L, 4L, 10L, 121L, 1L), prot_dom_end = c(339L, 334L,
339L, 668L, 606L, 596L, 140L, 112L, 124L, 166L, 66L, 199L, 424L,
1702L, 1702L, 1702L, 173L, 68L, 173L, 284L, 33L, 133L, 285L),
gene_name = c("CERKL", "CERKL", "CERKL", "CNGA3", "CNGA3",
"CNGA3", "CWC27", "CWC27", "CWC27", "CWC27", "CWC27", "KCNV2",
"KCNV2", "KIAA1549", "KIAA1549", "KIAA1549", "LRAT", "LRAT",
"LRAT", "MAK", "MAK", "MAK", "MAK")), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23"))
You could use data.table::foverlaps(), like this:
library(data.table)
setDT(coords_int)
setDT(pdoms_protein)
setkey(coords_int,start,end)
foverlaps(pdoms_protein,coords_int)
Also see package IRanges
Not tested, but I think you could make a function like this
pdoms_protein %>%
mutate(dom.ok = isok(start,end)) %>%
filter(dom.ok == "TRUE")
isok <- function(local_start, local_end) {
df <- coords_int %>%
filter(start >= local_start & end <= local_end)
return count(df) > 0
}

how do I Columns to x-axis

Please help i am trying to make all then columns into x-axis and the make side by side bars later by date
this is my data i really tried but to no avail
dateVisited hh_visited hh_ind_confirmed new_in_mig out_mig deaths HOH_death Preg_Obs Preg_Outcome child_forms
102 2020-07-21 292 1170 131 86 18 7 3 14 79
103 2020-07-22 400 1553 115 100 25 10 11 18 107
104 2020-07-23 381 1458 103 67 21 9 5 23 87
105 2020-07-24 345 1379 90 98 12 4 3 20 89
106 2020-07-25 436 1585 131 119 13 2 7 20 117
107 2020-07-26 0 0 0 0 0 0 0
0 0
I think you're looking for something like this:
library(tidyr)
library(ggplot2)
df %>%
pivot_longer(cols = -1) %>%
ggplot(aes(name, value)) +
geom_col(aes(fill = dateVisited), width = 0.6,
position = position_dodge(width = 0.8)) +
guides(x = guide_axis(angle = 45))
Reproducible Data from question
df <- structure(list(dateVisited = structure(1:6, .Label = c("2020-07-21",
"2020-07-22", "2020-07-23", "2020-07-24", "2020-07-25", "2020-07-26"
), class = "factor"), hh_visited = c(292L, 400L, 381L, 345L,
436L, 0L), hh_ind_confirmed = c(1170L, 1553L, 1458L, 1379L, 1585L,
0L), new_in_mig = c(131L, 115L, 103L, 90L, 131L, 0L), out_mig = c(86L,
100L, 67L, 98L, 119L, 0L), deaths = c(18L, 25L, 21L, 12L, 13L,
0L), HOH_death = c(7L, 10L, 9L, 4L, 2L, 0L), Preg_Obs = c(3L,
11L, 5L, 3L, 7L, 0L), Preg_Outcome = c(14L, 18L, 23L, 20L, 20L,
0L), child_forms = c(79L, 107L, 87L, 89L, 117L, 0L)), class = "data.frame",
row.names = c("102", "103", "104", "105", "106", "107"))
Your data cannot be used easily since it requires time to format it into something that could ingested by R. Here is something to get you started. I made up a hypothetical dataframe of 4 columns that resemble your data, use the function melt from reshape2 package to format the data such that it is understandable by ggplot2 package, and use ggplot2 package to generate a bar plot.
df <- data.frame(dateVisited = seq(as.Date('2019-01-01'), as.Date('2019-12-31'), 30),
hh_visited = runif(13, 0, 436),
hh_ind_confirmed = runif(13, 0, 1585),
new_in_mig = runif(13, 0, 131))
df <- reshape2::melt(df, id.vars = 'dateVisited')
ggplot(data = df, aes(x = dateVisited, y = value, fill = variable))+
geom_col(position = 'dodge')

How to Create Values based on Start-Stop Info in Separate Column

I have a very messy dataset created by a research device. This data shows a physiological measure ("Physio") for every few milliseconds ("Time"). The output lists several user messages, such as when a trial starts ("START_TRIAL n"), when a trial ends ("STOP_TRIAL"), and other random things that may be of interest to the researcher. Some times the "START_TRIAL n" message is repeated consecutively, and sometimes when there is no message, a simple "0" is left in what would otherwise be a blank cell.
I am hoping to create a new column that will signify which trial the current case belongs to. (See example data below).
Is there a way to do this with dplyr and mutate? I am wondering if I may need to do an if-then statement that changes the values of a new column for every case, but surely there's a more elegant solution? (Thank you in advance for helping out this newbie!)
Time Physio Cond
1 34 START_TRIAL 1
2 33 0
3 25 RANDOM_MSG
4 43 STOP_TRIAL
5 27 START_TRIAL 2
6 54 START_TRIAL 2
7 32 0
8 54 RANDOM_MSG
9 23 STOP_TRIAL
structure(list(Time = 1:9, Physio = c(34L, 33L, 25L, 43L, 27L,
54L, 32L, 54L, 23L), Cond = structure(c(4L, 2L, 3L, 6L, 5L, 5L,
2L, 3L, 6L), .Label = c("", "0", "RANDOM_MSG", "START_TRIAL 1",
"START_TRIAL 2", "STOP_TRIAL"), class = "factor")), .Names = c("Time",
"Physio", "Cond"), row.names = c(NA, 9L), class = "data.frame")
into
Time Physio Trial Cond
1 34 1 START_TRIAL 1
2 33 1 0
3 25 1 RANDOM_MSG
4 43 1 STOP_TRIAL
5 27 2 START_TRIAL 2
6 54 2 START_TRIAL 2
7 32 2 0
8 54 2 RANDOM_MSG
9 23 2 STOP_TRIAL
structure(list(Time = 1:9, Physio = c(34L, 33L, 25L, 43L, 27L,
54L, 32L, 54L, 23L), Trial = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L), Cond = structure(c(4L, 2L, 3L, 6L, 5L, 5L, 2L, 3L, 6L), .Label = c("",
"0", "RANDOM_MSG", "START_TRIAL 1", "START_TRIAL 2", "STOP_TRIAL"
), class = "factor")), .Names = c("Time", "Physio", "Trial",
"Cond"), row.names = c(NA, 9L), class = "data.frame")
One option would be to identify the 'START_TRIAL' with grep, do a match to get the index and fill the NA elements with the previous non-NA adjacent element
library(dplyr)
library(tidyr)
df1 %>%
mutate(Trial = match(PhysioCond, unique(grep("START_TRIAL",
PhysioCond, value = TRUE)))) %>%
fill(Trial)
# Time PhysioCond Trial
#1 34 START_TRIAL 1 1
#2 33 0 1
#3 25 RANDOM_MSG 1
#4 43 STOP_TRIAL 1
#5 27 START_TRIAL 2 2
#6 54 START_TRIAL 2 2
#7 32 0 2
#8 54 RANDOM_MSG 2
#9 23 STOP_TRIAL 2
NOTE: Not clear about the column name, but the logic should work well
data
df1 <- structure(list(Time = c(34L, 33L, 25L, 43L, 27L, 54L, 32L, 54L,
23L), PhysioCond = c("START_TRIAL 1", "0", "RANDOM_MSG", "STOP_TRIAL",
"START_TRIAL 2", "START_TRIAL 2", "0", "RANDOM_MSG", "STOP_TRIAL"
)), class = "data.frame", row.names = c("1", "2", "3", "4", "5",
"6", "7", "8", "9"))

How to apply multiple functions on grouped tibble using dplyr

I have the following tibble:
df <- structure(list(treatment = c("control", "control", "control",
"control", "control", "control", "treated", "treated", "treated",
"treated", "treated", "treated"), `0610005C13Rik` = c(5L, 2L,
2L, 5L, 1L, 0L, 6L, 1L, 0L, 5L, 1L, 2L), `0610007P14Rik` = c(300L,
249L, 166L, 104L, 248L, 136L, 164L, 121L, 191L, 187L, 289L, 169L
), `0610009B22Rik` = c(251L, 158L, 92L, 82L, 239L, 107L, 147L,
96L, 153L, 200L, 211L, 80L), `0610009L18Rik` = c(42L, 17L, 16L,
17L, 10L, 6L, 18L, 1L, 15L, 8L, 19L, 13L), `0610009O20Rik` = c(187L,
77L, 86L, 37L, 81L, 24L, 83L, 57L, 98L, 83L, 113L, 48L), `0610010B08Rik` = c(16L,
3L, 6L, 3L, 2L, 3L, 3L, 2L, 3L, 2L, 3L, 1L)), .Names = c("treatment",
"0610005C13Rik", "0610007P14Rik", "0610009B22Rik", "0610009L18Rik",
"0610009O20Rik", "0610010B08Rik"), row.names = c(NA, -12L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), vars = "treatment", drop = TRUE, indices = list(
0:5, 6:11), group_sizes = c(6L, 6L), biggest_group_size = 6L, labels = structure(list(
treatment = c("control", "treated")), row.names = c(NA, -2L
), class = "data.frame", vars = "treatment", drop = TRUE, .Names = "treatment"))
That looks like this:
Source: local data frame [12 x 7]
Groups: treatment [2]
treatment `0610005C13Rik` `0610007P14Rik` `0610009B22Rik` `0610009L18Rik` `0610009O20Rik` `0610010B08Rik`
<chr> <int> <int> <int> <int> <int> <int>
1 control 5 300 251 42 187 16
2 control 2 249 158 17 77 3
3 control 2 166 92 16 86 6
4 control 5 104 82 17 37 3
5 control 1 248 239 10 81 2
6 control 0 136 107 6 24 3
7 treated 6 164 147 18 83 3
8 treated 1 121 96 1 57 2
9 treated 0 191 153 15 98 3
10 treated 5 187 200 8 83 2
11 treated 1 289 211 19 113 3
12 treated 2 169 80 13 48 1
What I want to do is to calculate mean and coefficient variation (cv) based on grouped treatment. The CV is basically mean / sd sd / mean. The final expected result looks like this:
gene_symbol control.mean treated.mean control.cv treated.cv
0610005C13Rik 2.5000 2.500000 0.829457 ...
0610007P14Rik 200.5000 186.833333 ... ...
... etc ...
How can I do that using dplyr?
We can gather and then get the mean/sd
library(tidyverse)
df %>%
gather(gene_symbol, Val, -treatment) %>%
group_by(treatment, gene_symbol) %>%
summarise(Mean = mean(Val), cv = sd(Val)/mean(Val)) %>%
gather(Var1, Val, -treatment,-gene_symbol) %>%
unite(new, treatment, Var1) %>%
spread(new, Val)
# A tibble: 6 × 5
# gene_symbol control_cv control_Mean treated_cv treated_Mean
#* <chr> <dbl> <dbl> <dbl> <dbl>
#1 0610005C13Rik 0.8294577 2.5000 0.9715966 2.500000
#2 0610007P14Rik 0.3809605 200.5000 0.2992429 186.833333
#3 0610009B22Rik 0.4823019 154.8333 0.3582799 147.833333
#4 0610009L18Rik 0.6983225 18.0000 0.5515103 12.333333
#5 0610009O20Rik 0.6996217 82.0000 0.3040676 80.333333
#6 0610010B08Rik 0.9672317 5.5000 0.3499271 2.333333
Or another option is to get the mean, cv with summarise_all, then reshape into 'long' format and reconvert it back to 'wide'
df %>%
group_by(treatment) %>%
summarise_all(funs(mean = mean(.), cv = sd(.)/mean(.))) %>%
gather(Var, Val, -treatment) %>%
separate(Var, into = c('gene_symbol', 'Var2')) %>%
unite(new, treatment, Var2) %>%
spread(new, Val)
# A tibble: 6 × 5
# gene_symbol control_cv control_mean treated_cv treated_mean
#* <chr> <dbl> <dbl> <dbl> <dbl>
#1 0610005C13Rik 0.8294577 2.5000 0.9715966 2.500000
#2 0610007P14Rik 0.3809605 200.5000 0.2992429 186.833333
#3 0610009B22Rik 0.4823019 154.8333 0.3582799 147.833333
#4 0610009L18Rik 0.6983225 18.0000 0.5515103 12.333333
#5 0610009O20Rik 0.6996217 82.0000 0.3040676 80.333333
#6 0610010B08Rik 0.9672317 5.5000 0.3499271 2.333333
Or we can do this with melt/dcast from data.table
library(data.table)
dcast(melt(setDT(df), id.var = "treatment", variable.name = "gene_symbol"
)[, .(mean = mean(value), cv = sd(value)/mean(value)), .(treatment, gene_symbol)
], gene_symbol~treatment, value.var = c('mean', 'cv'))
# gene_symbol mean_control mean_treated cv_control cv_treated
#1: 0610005C13Rik 2.5000 2.500000 0.8294577 0.9715966
#2: 0610007P14Rik 200.5000 186.833333 0.3809605 0.2992429
#3: 0610009B22Rik 154.8333 147.833333 0.4823019 0.3582799
#4: 0610009L18Rik 18.0000 12.333333 0.6983225 0.5515103
#5: 0610009O20Rik 82.0000 80.333333 0.6996217 0.3040676
#6: 0610010B08Rik 5.5000 2.333333 0.9672317 0.3499271
EDIT: To reflect the changes in the OP's formula
Here is an approach using a join
library("tidyverse")
df %>% gather(key = gene_symbol, value = value,-treatment) %>%
group_by(treatment, gene_symbol) %>%
summarise(mean = mean(value), cv = mean / sd(value)) %>%
ungroup() %>%
left_join(
x = filter(., treatment == "control"),
y = filter(., treatment == "treated"),
by = "gene_symbol",
suffix = c(".control", ".treated")
) %>%
select(-starts_with("treatment"))

Resources