dplyr - subtract based on condition from two different data frames - r

I have a data frame which looks like the following:
quant <- structure(list(Name = structure(c(158L, 159L, 160L, 161L, 162L,
163L, 164L, 165L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 98L,
99L, 100L, 101L), .Label = c("abc_02_NEHC_025_100_A", "abc_02_NEHC_025_100_B",
"abc_02_NEHC_025_100_C", "abc_02_NEHC_025_100_D", "abc_02_NEHC_025_100_E",
"abc_02_NEHC_025_100_F", "abc_02_NEHC_025_100_G", "abc_02_NEHC_025_100_H",
"abc_02_NEHC_05_100_A", "abc_02_NEHC_05_100_B", "abc_02_NEHC_05_100_C",
"abc_02_NEHC_05_100_D", "abc_02_NEHC_05_100_E", "abc_02_NEHC_05_100_F",
"abc_02_NEHC_05_100_G", "abc_02_NEHC_05_100_H", "abc_02_NEHC_100_1_A",
"abc_02_NEHC_100_1_B", "abc_02_NEHC_100_1_C", "abc_02_NEHC_100_1_D",
"abc_02_NEHC_100_1_E", "abc_02_NEHC_100_1_F", "abc_02_NEHC_100_1_G",
"abc_02_NEHC_100_1_H", "abc_02_VL_025_100_A", "abc_02_VL_025_100_B",
"abc_02_VL_025_100_C", "abc_02_VL_025_100_D", "abc_02_VL_025_100_E",
"abc_02_VL_025_100_F", "abc_02_VL_025_100_G", "abc_02_VL_025_100_H",
"abc_02_VL_05_100_A", "abc_02_VL_05_100_B", "abc_02_VL_05_100_C",
"abc_02_VL_05_100_D", "abc_02_VL_05_100_E", "abc_02_VL_05_100_F",
"abc_02_VL_05_100_G", "abc_02_VL_05_100_H", "abc_02_VL_1_100_A",
"abc_02_VL_1_100_B", "abc_02_VL_1_100_C", "abc_02_VL_1_100_D",
"abc_02_VL_1_100_E", "abc_02_VL_1_100_F", "abc_02_VL_1_100_G",
"abc_02_VL_1_100_H", "BACKGROUND_NEHC_0125_100_A", "BACKGROUND_NEHC_0125_100_B",
"BACKGROUND_NEHC_0125_100_C", "BACKGROUND_NEHC_0125_100_D", "BACKGROUND_NEHC_0125_100_E",
"BACKGROUND_NEHC_0125_100_F", "BACKGROUND_NEHC_0125_100_G", "BACKGROUND_NEHC_025_100_A",
"BACKGROUND_NEHC_025_100_B", "BACKGROUND_NEHC_025_100_C", "BACKGROUND_NEHC_025_100_D",
"BACKGROUND_NEHC_025_100_F", "BACKGROUND_NEHC_025_100_G", "BACKGROUND_NEHC_05_100_A",
"BACKGROUND_NEHC_05_100_B", "BACKGROUND_NEHC_05_100_C", "BACKGROUND_NEHC_05_100_D",
"BACKGROUND_NEHC_05_100_F", "BACKGROUND_NEHC_05_100_G", "BACKGROUND_NEHC_05_100_H",
"BACKGROUND_NEHC_1_100_A", "BACKGROUND_NEHC_1_100_B", "BACKGROUND_NEHC_1_100_C",
"BACKGROUND_NEHC_1_100_D", "BACKGROUND_NEHC_1_100_E", "BACKGROUND_NEHC_1_100_F",
"BACKGROUND_NEHC_1_100_G", "BACKGROUND_VL_0125_100_A", "BACKGROUND_VL_0125_100_B",
"BACKGROUND_VL_0125_100_C", "BACKGROUND_VL_0125_100_D", "BACKGROUND_VL_0125_100_E",
"BACKGROUND_VL_0125_100_F", "BACKGROUND_VL_025_100_A", "BACKGROUND_VL_025_100_B",
"BACKGROUND_VL_025_100_C", "BACKGROUND_VL_025_100_D", "BACKGROUND_VL_025_100_E",
"BACKGROUND_VL_025_100_F", "BACKGROUND_VL_025_100_G", "BACKGROUND_VL_025_100_H",
"BACKGROUND_VL_05_100_A", "BACKGROUND_VL_05_100_B", "BACKGROUND_VL_05_100_C",
"BACKGROUND_VL_05_100_D", "BACKGROUND_VL_05_100_E", "BACKGROUND_VL_05_100_F",
"BACKGROUND_VL_05_100_G", "BACKGROUND_VL_05_100_H", "BACKGROUND_VL_1_100_A",
"BACKGROUND_VL_1_100_B", "BACKGROUND_VL_1_100_C", "BACKGROUND_VL_1_100_D",
"BACKGROUND_VL_1_100_E", "BACKGROUND_VL_1_100_F", "BACKGROUND_VL_1_100_G",
"BACKGROUND_VL_1_100_H", "Epq_11_NEHC_0125_100_a", "Epq_11_NEHC_0125_100_B",
"Epq_11_NEHC_0125_100_C", "Epq_11_NEHC_0125_100_D", "Epq_11_NEHC_0125_100_E",
"Epq_11_NEHC_0125_100_F", "Epq_11_NEHC_0125_100_G", "Epq_11_NEHC_025_100_a",
"Epq_11_NEHC_025_100_B", "Epq_11_NEHC_025_100_C", "Epq_11_NEHC_025_100_D",
"Epq_11_NEHC_025_100_E", "Epq_11_NEHC_05_100_a", "Epq_11_NEHC_05_100_B",
"Epq_11_NEHC_05_100_C", "Epq_11_NEHC_05_100_D", "Epq_11_NEHC_05_100_E",
"Epq_11_NEHC_05_100_F", "Epq_11_NEHC_05_100_G", "Epq_11_NEHC_05_100_H",
"Epq_11_NEHC_1_100_a", "Epq_11_NEHC_1_100_B", "Epq_11_NEHC_1_100_C",
"Epq_11_NEHC_1_100_D", "Epq_11_NEHC_1_100_E", "Epq_11_NEHC_1_100_F",
"Epq_11_NEHC_1_100_G", "Epq_11_NEHC_1_100_H", "Epq_11_VL_0125_100_A",
"Epq_11_VL_0125_100_B", "Epq_11_VL_0125_100_C", "Epq_11_VL_0125_100_D",
"Epq_11_VL_0125_100_E", "Epq_11_VL_0125_100_F", "Epq_11_VL_0125_100_G",
"Epq_11_VL_0125_100_H", "Epq_11_VL_025_100_A", "Epq_11_VL_025_100_B",
"Epq_11_VL_025_100_C", "Epq_11_VL_025_100_D", "Epq_11_VL_025_100_E",
"Epq_11_VL_025_100_F", "Epq_11_VL_025_100_G", "Epq_11_VL_025_100_H",
"Epq_11_VL_05_100_A", "Epq_11_VL_05_100_B", "Epq_11_VL_05_100_C",
"Epq_11_VL_05_100_D", "Epq_11_VL_05_100_E", "Epq_11_VL_05_100_F",
"Epq_11_VL_05_100_G", "Epq_11_VL_05_100_H", "Epq_11_VL_1_100_A",
"Epq_11_VL_1_100_B", "Epq_11_VL_1_100_C", "Epq_11_VL_1_100_D",
"Epq_11_VL_1_100_E", "Epq_11_VL_1_100_F", "Epq_11_VL_1_100_G",
"Epq_11_VL_1_100_H"), class = "factor"), conc_factor = structure(c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L), .Label = c("pep_0.125", "pep_0.25", "pep_0.5", "pep_1.0"
), class = "factor"), peptide_factor = structure(c(3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("ABC", "Background", "EpQ_11"), class = "factor"),
serum_factor = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NEHC",
"VL"), class = "factor"), mean_fluorescence = c(65535, 65535,
65534.93359, 65535, 65535, 65535, 65535, 65535, 21322.06055,
22704.08594, 22546.32617, 21801.30664, 21668.2168, 22054.40234,
21621.54688, 21516.33984, 17760.80273, 17886.12891, 18382.7832,
17531.80273)), class = "data.frame", row.names = c(NA, -20L
), .Names = c("Name", "conc_factor", "peptide_factor", "serum_factor",
"mean_fluorescence"))
This is actually just a slice (1:20) of my complete data frame. Just to have a better idea of my complete data frame, I am pasting below the levels of the variables conc_factor, peptide_factor and serum_factor:
levels(quant$conc_factor)
[1] "pep_0.125" "pep_0.25" "pep_0.5" "pep_1.0"
levels(quant$peptide_factor)
[1] "ABC" "Background" "EpQ_11"
levels(quant$serum_factor)
[1] "NEHC" "VL"
With the following command:
summary_backgrounds <- quant %>% filter(peptide_factor=="Background") %>% group_by(conc_factor, serum_factor) %>% summarise(avg_fluorescence_grouped = mean(mean_fluorescence))
conc_factor serum_factor avg_fluorescence_grouped
<fctr> <fctr> <dbl>
1 pep_0.125 NEHC 18439.70
2 pep_0.125 VL 16985.60
3 pep_0.25 NEHC 18666.52
4 pep_0.25 VL 17577.98
5 pep_0.5 NEHC 18300.47
6 pep_0.5 VL 18010.99
7 pep_1.0 NEHC 16103.50
8 pep_1.0 VL 17710.50
I obtained the mean_fluorescence values of the Background, for each conc_factor and serum_factor. What I am trying to do now is the following: I want to add a new variable to the data frame quant (named avg_fluorescence_minus_background) in which I subtract the background values (summary_backgrounds$avg_fluorescence_grouped, matched on conc_factor and serum_factor) from each of the individual values in quant$mean_fluorescence.
For example, for quant[1, ], given that I have conc_factor=="pep_1.0" and serum_factor=="VL", my result would be 65535.00 - 17710.50 = 47824.5. and so on.

Read up on joins and you'll find they make this type of problem very easy to solve:
# Attach the matching background mean to every row of quant; rows are matched
# on the two grouping columns of summary_backgrounds.
quant <- left_join(
  quant,
  summary_backgrounds,
  by = c("conc_factor", "serum_factor")
)
# Assign the result back: a bare mutate() call only prints the new column and
# then discards it. The column name is the one requested in the question.
quant <- mutate(
  quant,
  avg_fluorescence_minus_background = mean_fluorescence - avg_fluorescence_grouped
)

Related

How to plot learning curves for binary data?

I would like to plot simple learning curves. My data looks like this:
id trial type choice
1 1 A 0
1 2 A 1
2 1 B 1
2 2 B 0
structure(list(id = c(2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 6L, 6L, 6L, 6L, 6L), trial = c(1L, 2L, 3L,
4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
5L), choice = c(0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L), type = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L), .Label = c("A", "A3", "B"), class = "factor")), row.names = c(1L,
2L, 3L, 4L, 5L, 31L, 32L, 33L, 34L, 35L, 61L, 62L, 63L, 64L,
65L, 91L, 92L, 93L, 94L, 95L), class = "data.frame")
id, trial and choice are integers and type is a factor. I would like to plot the choices the different groups have made per trial. How I imagine the graph (a 1 in the vector choice is considered correct):
The smoothness of the curves is an exaggeration.
I would also like to know how can I do calculations by coupling groups. For example, sum all the choices of group A during trials 1 to 10.
Thank you for your help!
Basically you want to summarize your data first, then plot it. You can do this easily with dplyr and ggplot2 for example if your data is stored in a data.frame named dd
library(dplyr)
library(ggplot2)
# Mean of choice per type and trial, drawn as one line per type.
dd %>%
  group_by(type, trial) %>%
  summarise(correct = mean(choice)) %>%
  ggplot() +
  geom_line(mapping = aes(x = trial, y = correct, colour = type))
For each type and trial we calculate the mean value of choice, which is the proportion of people who answered correctly. Then we plot that value for each trial with a line that's colored by the type.

Why assign() is behaving oddly in for() loop with dplyr pipes in R?

I need to loop different functions in dataframes allocated in my Global Environment and save the output of each "run" of the loop in a new dataframe that includes the initial name.
For this end, I'm using assign() with for() loop. It works well, except if I use the dplyr pipe %>%. The function itself works, but there is some error with the name assigned to the output dataframe. How can I fix this issue with %>% ? If not possible to fix, can I change assign() for another function?
This works well:
code1:
for(i in unique(table$V1)){
assign(paste0(i, "_target"),table[grepl(i,table$V1),])
}
Explanation: Selects unique entries in column 1 of the "table" and subset the rows with these entries to a new dataframe per entry. Output: the new dataframe name is "entry name" + "_target"
This doesn't work well (and I would like to know why):
code2:
for(i in mget(ls(pattern = "_target"))){
assign(paste0(i, "_slim"),data.frame(i %>% group_by(Sample.Name) %>% summarise(Mean_dC=mean(C__))))
}
Explanation: Selects all dataframes in the Global Env whose names contain "_target". In each dataframe, it computes the mean of the values "(C__)" for entries with the same "(Sample.Name)". Expected output: the new dataframe is named "entry name_target" + "_slim". Real output: the new dataframe contains the per-name means, but is named "c(seemingly random numbers)_slim".
code2 input:
STA_target <- structure(list(Well = structure(c(8L, 9L, 10L, 21L, 22L, 23L,
33L, 34L, 35L, 46L, 47L, 48L, 58L, 59L, 60L, 73L, 74L, 75L, 85L,
86L, 87L, 97L, 98L, 99L), .Label = c("", "A1", "A10", "A11",
"A12", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "Analysis Type",
"B1", "B10", "B11", "B12", "B2", "B3", "B4", "B5", "B6", "B7",
"B8", "B9", "C1", "C10", "C11", "C12", "C2", "C3", "C4", "C5",
"C6", "C7", "C8", "C9", "Chemistry", "D1", "D10", "D11", "D12",
"D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9", "E1", "E10",
"E11", "E12", "E2", "E3", "E4", "E5", "E6", "E7", "E8", "E9",
"Endogenous Control", "Experiment File Name", "Experiment Run End Time",
"F1", "F10", "F11", "F12", "F2", "F3", "F4", "F5", "F6", "F7",
"F8", "F9", "G1", "G10", "G11", "G12", "G2", "G3", "G4", "G5",
"G6", "G7", "G8", "G9", "H1", "H10", "H11", "H12", "H2", "H3",
"H4", "H5", "H6", "H7", "H8", "H9", "Instrument Type", "Passive Reference",
"Reference Sample", "RQ Min/Max Confidence Level", "Well"), class = "factor"),
Sample.Name = c("Control_in", "Control_in", "Control_in",
"Sample2_in", "Sample2_in", "Sample2_in", "Sample5_in", "Sample5_in",
"Sample5_in", "Sample3_in", "Sample3_in", "Sample3_in", "Control_c",
"Control_c", "Control_c", "Sample2_c", "Sample2_c", "Sample2_c",
"Sample3_c", "Sample3_c", "Sample3_c", "Sample5_c", "Sample5_c",
"Sample5_c"), Target.Name = c("STA", "STA", "STA", "STA",
"STA", "STA", "STA", "STA", "STA", "STA", "STA", "STA", "STA",
"STA", "STA", "STA", "STA", "STA", "STA", "STA", "STA", "STA",
"STA", "STA"), Task = structure(c(3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L), .Label = c("", "Task", "UNKNOWN"), class = "factor"),
Reporter = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L
), .Label = c("", "Reporter", "SYBR"), class = "factor"),
Quencher = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("", "None", "Quencher"), class = "factor"),
RQ = structure(c(12L, 12L, 12L, 8L, 8L, 8L, 6L, 6L, 6L, 11L,
11L, 11L, 1L, 1L, 1L, 5L, 5L, 5L, 14L, 14L, 14L, 18L, 18L,
18L), .Label = c("", "0.706286132", "0.714652956", "0.724364996",
"0.7665869", "0.828774512", "0.838611245", "0.846661508",
"0.863589227", "0.896049678", "0.929288268", "1", "1.829339266",
"15.57538891", "17.64183807", "27.67574501", "3.064466953",
"34.78881073", "41.82569504", "8.117406845", "8.884188652",
"RQ"), class = "factor"), RQ.Min = structure(c(9L, 9L, 9L,
7L, 7L, 7L, 8L, 8L, 8L, 10L, 10L, 10L, 1L, 1L, 1L, 2L, 2L,
2L, 21L, 21L, 21L, 17L, 17L, 17L), .Label = c("", "0.032458056",
"0.429091513", "0.460811675", "0.541289926", "0.611138761",
"0.674698055", "0.71383971", "0.742018044", "0.753834546",
"0.772591949", "0.7868222", "0.803419232", "0.820919514",
"0.826185584", "0.989573121", "22.58564949", "27.2142868",
"4.501103401", "4.745172024", "4.843928814", "4.979007244",
"9.076541901", "RQ Min"), class = "factor"), RQ.Max = structure(c(13L,
13L, 13L, 8L, 8L, 8L, 6L, 6L, 6L, 9L, 9L, 9L, 1L, 1L, 1L,
16L, 16L, 16L, 19L, 19L, 19L, 20L, 20L, 20L), .Label = c("",
"0.858568788", "0.910271943", "0.943540215", "0.947846115",
"0.962214947", "0.971821666", "1.062453985", "1.145578504",
"1.162549496", "1.218146205", "1.244680166", "1.347676158",
"14.63914394", "15.85231876", "18.10507202", "20.37916756",
"3.381742954", "50.08181381", "53.58541107", "64.28199768",
"65.58969879", "84.38751984", "RQ Max"), class = "factor"),
C_ = c(25.48042297, 25.4738903, 25.83390617, 25.7304306,
25.78297043, 25.41260529, 25.49670792, 25.52298164, 25.6956234,
25.34812355, 25.51462555, 25.15455437, 0, 0, 0, 32.29237366,
37.10370636, 32.22016525, 29.50172043, 30.18544579, 29.91492081,
25.14842796, 24.89806747, 24.99397278), C_.Mean = c(25.59607506,
25.59607506, 25.59607506, 25.64200401, 25.64200401, 25.64200401,
25.57177162, 25.57177162, 25.57177162, 25.33910179, 25.33910179,
25.33910179, NA, NA, NA, 33.87208176, 33.87208176, 33.87208176,
29.86736107, 29.86736107, 29.86736107, 25.01348877, 25.01348877,
25.01348877), C_.SD = structure(c(21L, 21L, 21L, 20L, 20L,
20L, 12L, 12L, 12L, 19L, 19L, 19L, 1L, 1L, 1L, 31L, 31L,
31L, 23L, 23L, 23L, 14L, 14L, 14L), .Label = c("", "0.039937571",
"0.043110434", "0.049541138", "0.05469643", "0.061177365",
"0.066671595", "0.07365533", "0.079849631", "0.082057081",
"0.095515646", "0.108060829", "0.120047837", "0.126316145",
"0.129658803", "0.130481929", "0.142733917", "0.172286868",
"0.180205062", "0.200392827", "0.205995336", "0.236968249",
"0.344334781", "0.36769405", "0.413046211", "0.445171326",
"0.514641941", "0.640576839", "0.895943522", "0.993181109",
"2.798901796", "C_ SD"), class = "factor"), `_C_` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "_C_"), class = "factor"),
`_C_.Mean` = structure(c(8L, 8L, 8L, 5L, 5L, 5L, 4L, 4L,
4L, 7L, 7L, 7L, 1L, 1L, 1L, 3L, 3L, 3L, 13L, 13L, 13L, 14L,
14L, 14L), .Label = c("", "_C_ Mean", "-0.577166259", "-0.68969661",
"-0.720502198", "-0.776381195", "-0.85484314", "-0.96064502",
"-1.058534026", "-2.04822278", "-2.545912504", "-3.293611526",
"-4.921841145", "-6.081196308", "0.477069855", "1.373315215",
"2.092705965", "2.244637728", "2.251055479", "2.346632004",
"2.456220627", "2.557917356", "2.729323149", "2.746313095"
), class = "factor"), `_C_.SE` = structure(c(13L, 13L, 13L,
11L, 11L, 11L, 6L, 6L, 6L, 9L, 9L, 9L, 1L, 1L, 1L, 24L, 24L,
24L, 21L, 21L, 21L, 15L, 15L, 15L), .Label = c("", "_C_ SE",
"0.042180877", "0.042606823", "0.048373949", "0.077573851",
"0.088320434", "0.102536619", "0.108728357", "0.113733612",
"0.117972165", "0.144372106", "0.155044988", "0.223316222",
"0.224465802", "0.258952528", "0.300881863", "0.306413502",
"0.319273174", "0.579304695", "0.606897891", "0.635279417",
"0.682336032", "1.643036604"), class = "factor"), HK.Control._C_.Mean = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "HK Control _C_ Mean"
), class = "factor"), HK.Control._C_.SE = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "HK Control _C_ SE"
), class = "factor"), `__C_` = structure(c(12L, 12L, 12L,
16L, 16L, 16L, 18L, 18L, 18L, 13L, 13L, 13L, 1L, 1L, 1L,
19L, 19L, 19L, 7L, 7L, 7L, 10L, 10L, 10L), .Label = c("",
"__C_", "-0.871322632", "-1.61563623", "-3.021018982", "-3.15124011",
"-3.961196184", "-4.140928745", "-4.790550232", "-5.120551586",
"-5.38631773", "0", "0.105801903", "0.15834935", "0.211582825",
"0.240142822", "0.253925949", "0.27094841", "0.383478791",
"0.465211242", "0.484685272", "0.501675308"), class = "factor"),
Automatic.Ct.Threshold = structure(c(3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L), .Label = c("", "Automatic Ct Threshold",
"TRUE"), class = "factor"), Ct.Threshold = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("", "0.056211855",
"0.208910329", "0.693888608", "0.704941193", "Ct Threshold"
), class = "factor"), Automatic.Baseline = structure(c(3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("", "Automatic Baseline",
"TRUE"), class = "factor"), Baseline.Start = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("", "3", "Baseline Start"
), class = "factor"), Baseline.End = structure(c(3L, 3L,
4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 13L, 14L, 14L, 8L,
12L, 8L, 6L, 7L, 7L, 3L, 3L, 3L), .Label = c("", "21", "22",
"23", "25", "26", "27", "29", "30", "31", "32", "34", "35",
"39", "Baseline End"), class = "factor"), Efficiency = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("", "1", "Efficiency"
), class = "factor"), Comments = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "Comments"), class = "factor"),
HIGHSD = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L
), .Label = c("", "HIGHSD", "N", "Y"), class = "factor"),
NOAMP = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("",
"N", "NOAMP", "Y"), class = "factor"), OUTLIERRG = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("", "N", "OUTLIERRG",
"Y"), class = "factor"), EXPFAIL = structure(c(3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), .Label = c("", "EXPFAIL", "N", "Y"
), class = "factor")), .Names = c("Well", "Sample.Name",
"Target.Name", "Task", "Reporter", "Quencher", "RQ", "RQ.Min",
"RQ.Max", "C_", "C_.Mean", "C_.SD", "_C_", "_C_.Mean", "_C_.SE",
"HK.Control._C_.Mean", "HK.Control._C_.SE", "__C_", "Automatic.Ct.Threshold",
"Ct.Threshold", "Automatic.Baseline", "Baseline.Start", "Baseline.End",
"Efficiency", "Comments", "HIGHSD", "NOAMP", "OUTLIERRG", "EXPFAIL"
), row.names = c(12L, 13L, 14L, 24L, 25L, 26L, 36L, 37L, 38L,
48L, 49L, 50L, 60L, 61L, 62L, 72L, 73L, 74L, 84L, 85L, 86L, 96L,
97L, 98L), class = "data.frame")
code2 "output":
> dput(`c(8, 9, 10, 21, 22, 23, 33, 34, 35, 46, 47, 48, 58, 59, 60, 73, 74, 75, 85, 86, 87, 97, 98, 99)_slim`)
structure(list(Group.1 = c("Sample2_c", "Sample2_in", "Sample3_c",
"Sample5_in", "Control_c", "Control_in", "Sample5_c", "Sample3_in"
), x = c(33.8720817566667, 25.6420021066667, 29.8673623433333,
25.5717709866667, 0, 25.5960731466667, 25.0134894033333, 25.3391011566667
)), .Names = c("Group.1", "x"), row.names = c(NA, -8L), class = "data.frame")
I don't know if this is really the output because of the given name. But the expected output should be something like that with the correct name: STA_slim
Thank you for your time
First of all, I strongly suggest you avoid assign() in your R code. It's much better to use one of the many mapping/apply functions in R to build related data in lists. Using get/assign is a sign that you are not doing things in a very R-like way.
Your problem has nothing to do with dplyr really, it's what you are looping over in your loop. When you do
for(i in mget(ls(pattern = "_target"))){
assign(paste0(i, "_slim"),data.frame(i %>% group_by(Sample.Name) %>% summarise(Mean_dC=mean(C__))))
}
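that i isn't the name of the data.frame; because you called mget(), it is the data frame itself. It doesn't make sense to paste that into a new name: paste0() converts the data frame's columns to text, and assign() uses only the first resulting string, which is why the object ends up named after the contents of the first column ("c(8, 9, 10, ...)_slim").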
To "fix" this, you could do
# Loop over the *names* of the data frames and fetch each one with get().
# "_target$" matches only names that end in "_target", so the "*_target_slim"
# objects created by this loop are not picked up again if it is re-run.
for (i in ls(pattern = "_target$")) {
  assign(
    paste0(i, "_slim"),
    # NOTE: C__ is the column name used in the question's code.
    data.frame(get(i) %>% group_by(Sample.Name) %>% summarise(Mean_dC = mean(C__)))
  )
}
But even then you don't have a column named C__ in your example data set. You have C_ or _C_ or __C_ (what do these names even mean??). So you'd need to fix that.
The better list way would be
# One summary (mean of C_ per Sample.Name) for each object whose name ends in
# "_target", collected in a single list instead of separate global variables.
slim <- lapply(
  mget(ls(pattern = "_target$")),
  function(target_df) {
    target_df %>%
      group_by(Sample.Name) %>%
      summarise(Mean_dC = mean(C_))
  }
)

ggplot2 loop graph with conditional subsets

Data description:
I have a data set that is in long format with multiple different grouping variables (in data example: StandID and simID)
What I am trying to do:
I need to create simple scatter plots (x=predicted, y=observed) from this dataset for multiple columns based on a unique grouping variable.
An example of what I am trying to do using just standard plot is
obs=subset(example,simID=="OBS_OBS_OBS")
csfnw=example[example$simID== "CS_F_NW",]
plot(obs$X1HR,csfnw$X1HR)
I would need to do this for all simID and columns 9-14. (12 graphs total from data example)
What I have tried:
The problem I am running into is the y axis needs to remain the same, while cycling through the different subsets for the x axis.
I will admit up front, I have no idea what would be the best approach for this... I thought this would be easy for a split second because the data is already in long format and I would just be pointing to a subset of the data.
1) My original approach was to try and just splice up the data so that each simID had its own data frame, and compare it against the observation dataframe but I don't know how I would then pass it to ggplot.
2) My second idea was to make some kind of makeGraph function containing all the aesthetics I wanted essentially and use some kind of apply on it to pass everything through the function, but I could get neither to work.
makePlot=function(dat,x,y) {
ggplot(data=dat,aes(x=x,y=y))+geom_point(shape=Treat)+theme_bw()
}
What I could get to work was just breaking down the dataframe into the vectors of the variables I would then pass to some kind of loop/apply
sims=levels(example$simID)
sims2=sims[sims != "OBS_OBS_OBS"]
fuel_classes=colnames(example)[9:14]
Thank you
Data example:
example=structure(list(Year = structure(c(7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L), .Label = c("2001", "2002", "2003", "2004", "2005",
"2013", "2014", "2015"), class = "factor"), StandID = structure(c(10L,
2L, 6L, 22L, 14L, 18L, 34L, 26L, 30L, 10L, 2L, 6L, 22L, 14L,
18L, 34L, 26L, 30L, 10L, 2L, 6L, 22L, 14L, 18L, 34L, 26L, 30L
), .Label = c("1NB", "1NC", "1NT", "1NTB", "1RB", "1RC", "1RT",
"1RTB", "1SB", "1SC", "1ST", "1STB", "2NB", "2NC", "2NT", "2NTB",
"2RB", "2RC", "2RT", "2RTB", "2SB", "2SC", "2ST", "2STB", "3NB",
"3NC", "3NT", "3NTB", "3RB", "3RC", "3RT", "3RTB", "3SB", "3SC",
"3ST", "3STB"), class = "factor"), Block = structure(c(1L, 1L,
1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L,
1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("1", "2", "3"
), class = "factor"), Aspect = structure(c(3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L), .Label = c("N", "R", "S"), class = "factor"),
Treat = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("B", "C", "T", "TB"), class = "factor"),
Variant = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("CS", "OBS", "SN"), class = "factor"),
Fuels = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("F", "NF", "OBS"), class = "factor"),
Weather = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("NW", "OBS", "W"), class = "factor"),
X1HR = c(0.321666667, 0.177777778, 0.216111111, 0.280555556,
0.255555556, 0.251666667, 0.296666667, 0.231111111, 0.22,
0.27556628, 0.298042506, 0.440185249, 0.36150676, 0.398630172,
0.367523015, 0.345717251, 0.349305987, 0.412227929, 0.242860824,
0.258737177, 0.394024998, 0.287317872, 0.321927488, 0.281322986,
0.313588411, 0.303123146, 0.383658946), X10HR = c(0.440555556,
0.32, 0.266666667, 0.292222222, 0.496666667, 0.334444444,
0.564444444, 0.424444444, 0.432777778, 0.775042951, 0.832148314,
1.08174026, 1.023838878, 0.976997674, 0.844206274, 0.929837704,
1.0527215, 1.089246511, 0.88642776, 0.920596302, 1.209707737,
1.083737493, 1.077612877, 0.92481339, 1.041637182, 1.149550319,
1.229776621), X100HR = c(0.953888889, 1.379444444, 0.881666667,
1.640555556, 2.321666667, 1.122222222, 1.907777778, 1.633888889,
1.208333333, 1.832724094, 2.149356842, 2.364475727, 2.493232965,
2.262988567, 1.903909683, 2.135747433, 2.256677628, 2.288722038,
1.997704744, 2.087135553, 2.524872541, 2.34671092, 2.338253498,
2.06796217, 2.176314831, 2.580271006, 2.857197046), X1000HR = c(4.766666667,
8.342222222, 3.803333333, 8.057777778, 10.11444444, 6.931111111,
6.980555556, 13.20611111, 1.853333333, 3.389177084, 4.915714741,
2.795267582, 2.48227787, 2.218413353, 1.64684248, 2.716156483,
2.913746119, 2.238629341, 3.449863434, 3.432626724, 3.617531776,
3.641639471, 3.453454971, 3.176793337, 3.459602833, 3.871166945,
2.683447838), LITTER = c(2.4, 2.219444444, 2.772222222, 2.596666667,
2.693888889, 2.226111111, 2.552222222, 3.109444444, 2.963333333,
2.882233381, 3.025934696, 3.174396992, 3.291081667, 2.897673607,
2.737119675, 2.987895727, 3.679605484, 2.769756079, 2.882241249,
3.02594161, 3.174404144, 3.291091681, 2.897681713, 2.737129688,
2.987901449, 3.679611444, 2.769766569), DUFF = c(1.483333333,
1.723888889, 0.901666667, 1.520555556, 1.49, 1.366111111,
0.551666667, 1.056111111, 0.786111111, 2.034614563, 2.349547148,
1.685223818, 2.301301956, 2.609308243, 2.21895647, 2.043699026,
2.142618418, 0.953421116, 4.968493462, 4.990526676, 5.012362003,
5.023665905, 4.974074364, 4.947199821, 4.976779461, 5.082509995,
3.55211544), simID = structure(c(5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), .Label = c("CS_F_NW", "CS_F_W",
"CS_NF_NW", "CS_NF_W", "OBS_OBS_OBS", "SN_F_NW", "SN_F_W",
"SN_NF_NW", "SN_NF_W"), class = "factor")), .Names = c("Year",
"StandID", "Block", "Aspect", "Treat", "Variant", "Fuels", "Weather",
"X1HR", "X10HR", "X100HR", "X1000HR", "LITTER", "DUFF", "simID"
), row.names = c(37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L,
82L, 83L, 84L, 85L, 86L, 87L, 88L, 89L, 90L, 127L, 128L, 129L,
130L, 131L, 132L, 133L, 134L, 135L), class = "data.frame")
You were actually on the right track. If all plots are the same, just make one function and then use loops to loop over the subsets. For your example this can be done like this:
library(ggplot2)
# Scatter plot helper: draws the `x` and `y` columns of `dat` as points
# (shape 18) on a black-and-white theme, with `title` as the plot title.
plotFun <- function(dat, title) {
  points_layer <- geom_point(aes(x = x, y = y), shape = 18)
  ggplot(data = dat) + points_layer + ggtitle(title) + theme_bw()
}
# columns of interest
colIdx <- 9:14
# split on all values of simID
dfList <- split(example, example$simID)
# simID has factor levels that never occur in the data; drop the empty subsets
# (vapply() gives a typed integer vector instead of comparing a list to 0)
dfList <- dfList[vapply(dfList, nrow, integer(1)) != 0]
# make empty list-array for saving plots, indexed [x subset, y subset, column]
plotList <- array(
  list(),
  dim = c(length(dfList), length(dfList), length(colIdx)),
  dimnames = list(names(dfList), names(dfList), names(example)[colIdx])
)
# the first two loops loop over all unique pairs (i > j) of dfList;
# seq_along()/seq_len() give empty ranges when fewer than two subsets remain,
# whereas 2:length(dfList) would count down (2, 1) and index out of bounds
for (i in seq_along(dfList)[-1]) {
  for (j in seq_len(i - 1)) {
    # loop over target variables
    for (k in seq_along(colIdx)) {
      # store variables to plot in a temporary dataframe
      tempDf <- data.frame(
        x = dfList[[i]][, colIdx[k]],
        y = dfList[[j]][, colIdx[k]]
      )
      # add a title so we can see in the plot what is plotted vs what
      title <- paste0(
        names(dfList)[i], ":", names(dfList[[i]])[colIdx[k]], " VS ",
        names(dfList)[j], ":", names(dfList[[j]])[colIdx[k]]
      )
      # make and save plot
      plotList[[i, j, k]] <- plotFun(tempDf, title)
    }
  }
}
# call the plots like this
plotList[[2, 1, 4]]
# Note that we only filled the lower triangle of combinations
# therefore indexing with [[1, 1, 1]] just returns NULL
plotList[, , 1]
This process can probably be more optimized, but when creating graphs I would go for clarity above speed since speed usually isn't an issue.

Reshape a large matrix with missing values and multiple vars of interest [duplicate]

This question already has answers here:
Convert data from long format to wide format with multiple measure columns
(6 answers)
Closed 4 years ago.
I need to reorganize a large dataset into a specific format for further analysis. Right now the data are in long format, with multiple records through time for each point. I need to reshape the data so that each point has a single record, but it will add many new columns of the time-specific data. I’ve looked at previous similar posts but I need to ultimately convert several of the current variables into columns, and I can’t find an example of such. Is there a way to accomplish this in a single reshape, or will I have to do several and then concatenate the new columns back together? Another wrinkle before I post the example is that not all points were sampled at each time-step, so I need those values to show up as NA. For example, (see data below) SitePoint A1 was not sampled at all in 2012, SitePoint A10 was not sampled during the first round in 2012, but K83 was sampled all nine times.
# Example input in long format: 42 rows, one per SitePoint x Year_Rotation
# sample. SitePoint, Year_Rotation, MR_Fire and fire_seas are factors (note
# that MR_Fire holds numeric-looking labels stored as a factor); OptTSF is an
# integer vector of 0/1 values. Row names are non-sequential -- presumably
# carried over from the full dataset.
mydatain <- structure(list(SitePoint = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 6L, 6L), .Label = c("A1", "A10", "K145", "K83", "T15",
"T213"), class = "factor"), Year_Rotation = structure(c(1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 1L, 2L, 4L, 5L,
6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 1L, 7L), .Label = c("2010_1", "2010_2",
"2010_3", "2011_1", "2011_2", "2011_3", "2012_1", "2012_2", "2012_3"
), class = "factor"), MR_Fire = structure(c(5L, 6L, 6L, 2L, 9L,
9L, 5L, 6L, 6L, 2L, 9L, 9L, 7L, 8L, 16L, 17L, 21L, 22L, 23L,
25L, 3L, 4L, 10L, 11L, 12L, 13L, 14L, 15L, 18L, 19L, 20L, 1L,
2L, 2L, 5L, 6L, 6L, 11L, 11L, 12L, 7L, 24L), .Label = c("0",
"1", "10", "11", "12", "13", "14", "15", "2", "23", "24", "25",
"35", "36", "37", "39", "40", "47", "48", "49", "51", "52", "53",
"8", "9"), class = "factor"), fire_seas = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 3L), .Label = c("dry", "fire", "wet"
), class = "factor"), OptTSF = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 1L)), .Names = c("SitePoint", "Year_Rotation", "MR_Fire",
"fire_seas", "OptTSF"), row.names = c(31L, 32L, 33L, 34L, 35L,
36L, 67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 10543L, 10544L,
10545L, 10546L, 10547L, 10548L, 10549L, 10550L, 14988L, 14989L,
14990L, 14991L, 14992L, 14993L, 14994L, 14995L, 14996L, 17370L,
17371L, 17372L, 17373L, 17374L, 17375L, 17376L, 17377L, 17378L,
19353L, 19354L), class = "data.frame")
Ultimately I need something like this:
# Desired wide-format result: 6 rows (one per SitePoint) and 28 columns --
# SitePoint plus an MR_Fire_*, season_* and OptTSF_* column for each of the
# nine Year_Rotation values. NA marks a round in which a point has no record.
# Here the MR_Fire_* and OptTSF_* columns are integer vectors and each
# season_* column is a factor with its own set of levels.
myfinal <- structure(list(SitePoint = structure(1:6, .Label = c("A1", "A10",
"K145", "K83", "T15", "T213"), class = "factor"), MR_Fire_2010_1 = c(12L,
12L, 39L, 23L, 0L, 14L), MR_Fire_2010_2 = c(13L, 13L, 40L, 24L,
1L, NA), MR_Fire_2010_3 = c(13L, 13L, NA, 25L, 1L, NA), MR_Fire_2011_1 = c(1L,
1L, 51L, 35L, 12L, NA), MR_Fire_2011_2 = c(2L, 2L, 52L, 36L,
13L, NA), MR_Fire_2011_3 = c(2L, 2L, 53L, 37L, 13L, NA), MR_Fire_2012_1 = c(NA,
NA, 9L, 47L, 24L, 8L), MR_Fire_2012_2 = c(NA, 14L, 10L, 48L,
24L, NA), MR_Fire_2012_3 = c(NA, 15L, 11L, 49L, 25L, NA), season_2010_1 = structure(c(2L,
2L, 1L, 2L, 2L, 1L), .Label = c("dry", "fire"), class = "factor"),
season_2010_2 = structure(c(2L, 2L, 1L, 2L, 2L, NA), .Label = c("dry",
"fire"), class = "factor"), season_2010_3 = structure(c(1L,
1L, NA, 1L, 1L, NA), .Label = "fire", class = "factor"),
season_2011_1 = structure(c(2L, 2L, 1L, 2L, 2L, NA), .Label = c("dry",
"fire"), class = "factor"), season_2011_2 = structure(c(2L,
2L, 1L, 2L, 2L, NA), .Label = c("dry", "fire"), class = "factor"),
season_2011_3 = structure(c(2L, 2L, 1L, 2L, 2L, NA), .Label = c("dry",
"fire"), class = "factor"), season_2012_1 = structure(c(NA,
NA, 2L, 1L, 1L, 2L), .Label = c("fire", "wet"), class = "factor"),
season_2012_2 = structure(c(NA, 1L, 2L, 1L, 1L, NA), .Label = c("fire",
"wet"), class = "factor"), season_2012_3 = structure(c(NA,
1L, 2L, 1L, 1L, NA), .Label = c("fire", "wet"), class = "factor"),
OptTSF_2010_1 = c(1L, 1L, 0L, 1L, 1L, 1L), OptTSF_2010_2 = c(1L,
1L, 0L, 1L, 1L, NA), OptTSF_2010_3 = c(1L, 1L, NA, 1L, 1L,
NA), OptTSF_2011_1 = c(1L, 1L, 0L, 0L, 1L, NA), OptTSF_2011_2 = c(1L,
1L, 0L, 0L, 1L, NA), OptTSF_2011_3 = c(1L, 1L, 0L, 0L, 1L,
NA), OptTSF_2012_1 = c(NA, NA, 1L, 0L, 0L, 1L), OptTSF_2012_2 = c(NA,
1L, 1L, 0L, 0L, NA), OptTSF_2012_3 = c(NA, 1L, 1L, 0L, 0L,
NA)), .Names = c("SitePoint", "MR_Fire_2010_1", "MR_Fire_2010_2",
"MR_Fire_2010_3", "MR_Fire_2011_1", "MR_Fire_2011_2", "MR_Fire_2011_3",
"MR_Fire_2012_1", "MR_Fire_2012_2", "MR_Fire_2012_3", "season_2010_1",
"season_2010_2", "season_2010_3", "season_2011_1", "season_2011_2",
"season_2011_3", "season_2012_1", "season_2012_2", "season_2012_3",
"OptTSF_2010_1", "OptTSF_2010_2", "OptTSF_2010_3", "OptTSF_2011_1",
"OptTSF_2011_2", "OptTSF_2011_3", "OptTSF_2012_1", "OptTSF_2012_2",
"OptTSF_2012_3"), class = "data.frame", row.names = c(NA, -6L
))
The actual dataset is about 23656 records X 15 variables, so doing it by hand is likely to cause major headaches and potential for mistakes. Any help or suggestions are appreciated. If this has been answered elsewhere, apologies. I couldn’t find anything directly applicable; everything seemed to relate to three columns, with only one of those being extracted as new variables. Thanks.
SP
dcast from the devel version of data.table (i.e., v1.9.5) can cast multiple value columns simultaneously. The devel version can be installed from the project's GitHub repository (Rdatatable/data.table).
library(data.table) ## v1.9.5+
# Convert mydatain to a data.table by reference, then cast all three
# measurement columns to wide format in a single call: one row per SitePoint
# and one column per value column x Year_Rotation level.
setDT(mydatain)
dcast(
  mydatain,
  SitePoint ~ Year_Rotation,
  value.var = c("MR_Fire", "fire_seas", "OptTSF")
)
You can use reshape to change the structure of your dataframe from long to wide using the following code:
# Reshape the long table to one row per SitePoint using base R. Every column
# other than idvar/timevar is spread into one column per Year_Rotation level;
# SitePoint/Year_Rotation combinations with no record come out as NA.
# sep = "_" names the new columns like MR_Fire_2010_1 (the layout asked for)
# instead of the default MR_Fire.2010_1.
reshape(
  mydatain,
  timevar = "Year_Rotation",
  idvar = "SitePoint",
  direction = "wide",
  sep = "_"
)

R: How to aggregate data into percentages without missing data for stacked-bar plot in ggplot2?

I would like to summarize my "karyotype" molecular data by location and substrate (see sample data below) as percentages in order to create a stacked bar plot in ggplot2.
I have figured out how to use 'dcast' to get a total for each karyotype, but cannot figure out how to get a percent for each of the three karyotypes (i.e. 'BB', 'BD', 'DD').
The data should be in a format to make a stacked bar plot in 'ggplot2'.
Sample Data:
library(reshape2)
# Sample data: 25 rows with three factor columns (Location, Substrate,
# Karyotype). Karyotype has the levels "", "BB", "BD" and "DD"; the
# empty-string level presumably marks a missing karyotype call.
Karotype.Data <- structure(list(Location = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L), .Label = c("Kampinge", "Kaseberga", "Molle", "Steninge"
), class = "factor"), Substrate = structure(c(1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L,
2L, 2L, 2L, 2L, 2L), .Label = c("Kampinge", "Kaseberga", "Molle",
"Steninge"), class = "factor"), Karyotype = structure(c(1L, 3L,
4L, 4L, 3L, 3L, 4L, 4L, 4L, 3L, 1L, 4L, 3L, 4L, 4L, 3L, 1L, 4L,
3L, 3L, 4L, 3L, 4L, 3L, 3L), .Label = c("", "BB", "BD", "DD"), class = "factor")), .Names = c("Location",
"Substrate", "Karyotype"), row.names = c(135L, 136L, 137L, 138L,
139L, 165L, 166L, 167L, 168L, 169L, 236L, 237L, 238L, 239L, 240L,
326L, 327L, 328L, 329L, 330L, 426L, 427L, 428L, 429L, 430L), class = "data.frame")
## Summary count for each karyotype ##
# Rows are Location x Substrate combinations; the karyotype values are spread
# across columns, each cell holding the number of matching rows.
Karyotype.Summary <- dcast(
  Karotype.Data,
  Location + Substrate ~ Karyotype,
  value.var = "Karyotype",
  fun.aggregate = length
)
You can use the dplyr package:
library(dplyr)
# Count rows per Location x Substrate x Karyotype combination.
# The dplyr verbs are namespace-qualified so that attaching plyr after dplyr
# (which masks summarise()/summarize()/mutate() with versions that know
# nothing about dplyr groups) cannot silently change the result.
z.counts <- Karotype.Data %>%
  dplyr::group_by(Location, Substrate, Karyotype) %>%
  dplyr::summarise(freq = n())

# Turn the counts into percentages within each Location x Substrate pair,
# then drop the grouping so later steps work on a plain, ungrouped table.
z.freq <- z.counts %>%
  dplyr::group_by(Location, Substrate) %>%
  dplyr::mutate(freq = freq / sum(freq) * 100) %>%
  dplyr::ungroup()
Here, the data remain in the long format, so it is straightforward to build the barplot with ggplot:
library(ggplot2)
# One panel per Location (rows) x Substrate (columns); each bar's height is
# the precomputed freq value for that karyotype.
ggplot(z.freq, aes(x = Karyotype, y = freq)) +
  geom_bar(stat = "identity") +
  facet_grid(Location ~ Substrate)
With some help from 'Marat Talipov' and many other answers to questions on Stackoverflow I found out that it is important to load 'plyr' before 'dplyr' and to use 'summarise' rather than 'summarize'. Then removing the missing data was the last step using 'filter'.
library(dplyr)
# Count rows per Location x Substrate x Karyotype combination.
# The dplyr verbs are namespace-qualified so that a later-attached plyr
# (whose summarise()/mutate() know nothing about dplyr groups) cannot mask
# them.
z.counts <- Karotype.Data %>%
  dplyr::group_by(Location, Substrate, Karyotype) %>%
  dplyr::summarise(freq = n())

# Drop the rows with a blank karyotype, convert the remaining counts to
# proportions within each Location x Substrate pair, and remove the grouping
# so the result is a plain table.
z.freq <- z.counts %>%
  dplyr::filter(Karyotype != "") %>%
  dplyr::group_by(Location, Substrate) %>%
  dplyr::mutate(freq = freq / sum(freq)) %>%
  dplyr::ungroup()
z.freq
library (ggplot2)
# Stacked bars: one bar per Substrate, segments filled by Karyotype, with a
# separate panel for each Location.
ggplot(z.freq) +
  aes(x = Substrate, y = freq, fill = Karyotype) +
  facet_wrap(~ Location) +
  geom_bar(stat = "identity")
Now I have created the plot I was looking for:

Resources