Related
I have fish count data and am trying to create a new data frame using averages of the measurements, grouped by the values of two different columns. Here is my data:
df <- structure(list(SITE = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L,
2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 3L), .Label = c("1", "2", "3"), class = "factor"),
ZONE = structure(c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("CREST", "INNER_FLAT", "MID_FLAT",
"OUTER_FLAT", "SLOPE"), class = "factor"), C_TOTAL = c(44L,
7L, 20L, 14L, 0L, 4L, 2L, 3L, 1L, 8L, 28L, 24L, 31L, 12L,
33L, 6L, 16L, 33L, 75L, 21L, 60L, 81L, 37L, 89L, 21L, 35L,
71L, 5L, 2L, 0L, 0L, 10L, 23L, 0L, 5L, 11L, 3L, 1L, 5L, 0L,
0L, 8L, 7L, 6L, 42L), C_M2 = c(0.210465706, 0.029861994,
0.090324177, 0.066599319, 0, 0.022092452, 0.011750593, 0.015245519,
0.004710433, 0.033111594, 0.155094195, 0.110576495, 0.193659068,
0.059152822, 0.192379108, 0.047800772, 0.08917095, 0.141336411,
0.402538785, 0.130438337, 0.315206235, 0.460746849, 0.278643938,
0.467754275, 0.192830321, 0.119928472, 0.411502497, 0.015370489,
0.005150184, 0, 0, 0.034651441, 0.067824733, 0, 0.009805851,
0.034844309, 0.010614352, 0.004131048, 0.01850898, 0, 0,
0.029195413, 0.021409016, 0.030498145, 0.172406074), TRANS_A = c(209.0601875,
234.411677, 221.4246571, 210.2123593, 226.6158348, 181.0573136,
170.2041767, 196.7791332, 212.294701, 241.6072127, 180.5354478,
217.0443184, 160.0751279, 202.8643689, 171.536298, 125.5209863,
179.4306337, 233.485481, 186.3174499, 160.9956132, 190.3515643,
175.801528, 132.7859497, 190.2708425, 108.9040348, 291.8406241,
172.5384427, 325.2986863, 388.3356059, 303.1957479, 261.1574528,
288.5882879, 339.1093313, 239.1118021, 509.89965, 315.6899993,
282.6362022, 242.0693453, 270.1391425, 294.8864591, 321.2013381,
274.0156514, 326.9650539, 196.7332763, 243.6109069), SCARID_T = c(35L,
4L, 4L, 13L, 0L, 4L, 2L, 0L, 1L, 4L, 20L, 12L, 17L, 5L, 20L,
6L, 6L, 18L, 63L, 11L, 41L, 75L, 34L, 89L, 14L, 33L, 68L,
0L, 0L, 0L, 0L, 10L, 22L, 0L, 0L, 10L, 0L, 0L, 1L, 0L, 0L,
6L, 0L, 4L, 42L), ACAN_T = c(4L, 0L, 11L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 8L, 5L, 0L, 0L, 0L, 0L, 3L, 2L, 7L, 8L, 8L, 1L,
1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 5L, 0L, 0L,
0L, 1L, 0L, 0L, 2L, 0L, 0L, 0L), SIG_T = c(5L, 3L, 5L, 1L,
0L, 0L, 0L, 3L, 0L, 4L, 0L, 7L, 14L, 7L, 13L, 0L, 7L, 13L,
5L, 2L, 11L, 5L, 2L, 0L, 7L, 1L, 3L, 5L, 2L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 3L, 1L, 3L, 0L, 0L, 0L, 7L, 2L, 0L)), row.names = c(NA,
-45L), class = "data.frame")
I want to average all the measurements by each zone, but also according to site. So I want a new data frame where each site has one measurement for each zone.
Can anyone help me? Thanks!
library(dplyr)
# Average every numeric measurement within each SITE x ZONE combination,
# giving one row per site and zone.
by_site_zone <- group_by(df, SITE, ZONE)
summarise(by_site_zone, across(where(is.numeric), mean))
# A tibble: 15 x 8
# Groups: SITE [3]
SITE ZONE C_TOTAL C_M2 TRANS_A SCARID_T ACAN_T SIG_T
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 CREST 20 0.0996 213. 12 4.33 3.67
2 1 INNER_FLAT 3 0.0111 265. 0.333 0.333 2.33
3 1 MID_FLAT 2.33 0.00684 339. 0 0 2.33
4 1 OUTER_FLAT 52 0.283 179. 38.3 7.67 6
5 1 SLOPE 23.7 0.110 222. 14.3 5 4.33
6 2 CREST 25.3 0.148 178. 14 0 11.3
7 2 INNER_FLAT 2.67 0.00973 297. 2 0.667 0
8 2 MID_FLAT 11 0.0342 296. 10.7 0.333 0
9 2 OUTER_FLAT 69 0.402 166. 66 0.667 2.33
10 2 SLOPE 6 0.0296 206. 5.67 0 0.333
11 3 CREST 18.3 0.0928 179. 10 1.67 6.67
12 3 INNER_FLAT 18.3 0.0748 256. 15.3 0 3
13 3 MID_FLAT 5.33 0.0149 355. 3.33 1.67 0.333
14 3 OUTER_FLAT 42.3 0.241 191. 38.3 0.333 3.67
15 3 SLOPE 2 0.0106 193. 1 0 1
I would like to plot simple learning curves. My data looks like this:
id trial type choice
1 1 A 0
1 2 A 1
2 1 B 1
2 2 B 0
structure(list(id = c(2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 6L, 6L, 6L, 6L, 6L), trial = c(1L, 2L, 3L,
4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
5L), choice = c(0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L), type = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L), .Label = c("A", "A3", "B"), class = "factor")), row.names = c(1L,
2L, 3L, 4L, 5L, 31L, 32L, 33L, 34L, 35L, 61L, 62L, 63L, 64L,
65L, 91L, 92L, 93L, 94L, 95L), class = "data.frame")
ID, Trial and Type are integers and Choice is a factor. I would like to plot the choices the different groups have made per trial. How I imagine the graph (a 1 in the vector choice is considered correct):
The smoothness of the curves is an exaggeration.
I would also like to know how can I do calculations by coupling groups. For example, sum all the choices of group A during trials 1 to 10.
Thank you for your help!
Basically you want to summarize your data first, then plot it. You can do this easily with dplyr and ggplot2 for example if your data is stored in a data.frame named dd
library(dplyr)
library(ggplot2)

# Learning curves: for each group (`type`) and each `trial`, the mean of
# `choice` is the proportion of correct answers (a 1 counts as correct).
dd %>%
  group_by(type, trial) %>%
  # `.groups = "drop"` returns an ungrouped summary, so no leftover grouping
  # by `type` is carried forward and no regrouping message is printed.
  summarize(correct = mean(choice), .groups = "drop") %>%
  ggplot() +
  # One line per group: trial on the x-axis, proportion correct on the y-axis.
  geom_line(aes(trial, correct, color = type))
For each type and trial we calculate the mean value of choice to get the percent of people who answered correctly. Then we plot that value for each trial with a line that's colored by the type.
I have a dataframe df with following information:
df <- structure(list(Samples = structure(c(1L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 2L, 1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 2L, 1L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 2L, 1L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 2L), .Label = c("Sample1", "Sample10", "Sample2",
"Sample3", "Sample4", "Sample5", "Sample6", "Sample7", "Sample8",
"Sample9"), class = "factor"), patient.vital_status = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 1L), years = c(3.909589041, 1.457534247,
2.336986301, 5.010958904, 1.665753425, 1.81369863, 1.191780822,
4.687671233, 2.167123288, 1.95890411, 3.909589041, 1.457534247,
2.336986301, 5.010958904, 1.665753425, 1.81369863, 1.191780822,
4.687671233, 2.167123288, 1.95890411, 3.909589041, 1.457534247,
2.336986301, 5.010958904, 1.665753425, 1.81369863, 1.191780822,
4.687671233, 2.167123288, 1.95890411, 3.909589041, 1.457534247,
2.336986301, 5.010958904, 1.665753425, 1.81369863, 1.191780822,
4.687671233, 2.167123288, 1.95890411), Genes = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("A1BG", "A1CF", "A2M",
"A2ML1"), class = "factor"), value = c(0.034459012, 0.017698878,
0.023313851, 0.010456762, 0.032674019, 0.037561831, 0.03380681,
0, 0.019954956, 0.012392427, 0.835801613, 2.265192447, 2.431409095,
5.012117956, 2.139962802, 2.371946704, 4.555234385, 0.550293401,
0.924012327, 2.274642129, 92.85639578, 79.50897642, 23.72187602,
26.86025304, 32.80504253, 222.6449054, 71.78812505, 45.76371588,
29.93976676, 22.97515484, 0.03780441, 0.005825143, 0, 0.002867985,
0.011948708, 0.02060423, 0.004636111, 0.015903347, 0.005473063,
0.033988816)), class = "data.frame", row.names = c(NA, -40L))
I want to loop over the genes in the column Genes, compute a result from the column value for each gene, and add that result to the data frame df as a new column. The result will be either low or high.
I'm trying to do this with the following code, but it doesn't work:
# Attempt to categorize expression values as low/high gene by gene
# (this is the version reported as not working).
# Distinct gene names, as character strings, to iterate over.
genes <- as.character(unique(df$Genes))
library(survival)
library(survminer)
# NOTE: the loop variable `i` is never referenced in the body, so every
# iteration runs the same calls on the full, unsplit `df`.
for(i in genes){
surv_rnaseq.cut <- surv_cutpoint(
df,
time = "years",
event = "patient.vital_status",
# `Genes` (a factor in this data) is passed as a cutpoint variable
# alongside the numeric `value`.
variables = c("Genes","value"))
# The whole return value of surv_categorize() is assigned to the single
# column `cat`, and `surv_rnaseq.cut` is overwritten on each pass, so no
# per-gene result is kept.
df$cat <- surv_categorize(surv_rnaseq.cut)
}
Along with the above result I also wanted the summary for surv_rnaseq.cut for all the four genes with mentioning its name.
Any help please. thanq
An option would be to split by 'genes' (group_split), loop over the list, apply the functions and bind the list elements after creating the column
library(survminer)
library(survival)
library(dplyr)
library(purrr)

# Split the data into one data frame per gene, categorize each gene's
# expression values, and row-bind the pieces back together.
df %>%
  group_split(Genes) %>%
  map_dfr(function(gene_df) {
    # Cutpoint computed from this gene's rows only.
    gene_cut <- surv_cutpoint(
      gene_df,
      time = "years",
      event = "patient.vital_status",
      variables = c("Genes", "value")
    )
    # The categorized `value` column holds the "low"/"high" labels.
    cut_labels <- pull(surv_categorize(gene_cut), value)
    mutate(gene_df, cat = cut_labels)
  })
# A tibble: 40 x 6
# Samples patient.vital_status years Genes value cat
# <fct> <int> <dbl> <fct> <dbl> <chr>
# 1 Sample1 0 3.91 A1BG 0.0345 high
# 2 Sample2 0 1.46 A1BG 0.0177 high
# 3 Sample3 0 2.34 A1BG 0.0233 high
# 4 Sample4 0 5.01 A1BG 0.0105 high
# 5 Sample5 0 1.67 A1BG 0.0327 high
# 6 Sample6 0 1.81 A1BG 0.0376 high
# 7 Sample7 0 1.19 A1BG 0.0338 high
# 8 Sample8 1 4.69 A1BG 0 low
# 9 Sample9 0 2.17 A1BG 0.0200 high
#10 Sample10 1 1.96 A1BG 0.0124 high
# … with 30 more rows
I have a data frame like this dummy sample; my real dataset has 56 variables.
I would like to drop the date, aggregate by id, and sum the last 4 total variables while keeping the others unchanged.
# Dummy sample: two ids observed on five consecutive dates (10 rows).
# The descriptive columns are constant within an id, so they simply
# alternate between the two ids; only the four totals vary by day.
df <- data.frame(
  date = rep(paste0("2019-02-", 10:14), each = 2L),
  id = rep(c("18100410-aa", "18101080-ae"), times = 5L),
  f_type = rep(c(4L, 2L), times = 5L),
  reg = rep(c(6L, 7L), times = 5L),
  hh_p10 = rep(c(2L, 1L), times = 5L),
  internet = rep(c(1L, 2L), times = 5L),
  youngest = rep(c(5L, 7L), times = 5L),
  a_group = rep(c(3L, 6L), times = 5L),
  total_prd = c(130L, 337L, 374L, 261L, 106L, 230L, 150L, 36L, 15L, 123L),
  B_totalprod = c(20L, 0L, 256L, 0L, 32L, 0L, 0L, 36L, 0L, 45L),
  p_totalprod = c(0L, 81L, 11L, 260L, 26L, 230L, 0L, 0L, 15L, 0L),
  n_totalprod = c(110L, 256L, 107L, 1L, 48L, 0L, 150L, 0L, 0L, 78L),
  stringsAsFactors = FALSE
)
I found this solution using the plyr package here. It works, but I need to specify all 52 of my unaffected variables. Is there any other way to do this task?
library(plyr)
# Columns used as grouping keys; each one listed here is carried through
# to the result alongside the summed totals.
key_cols <- c(
  "id", "f_type", "reg", "internet", "hh_p10", "youngest", "a_group"
)
# Collapse the daily rows to one row per key combination by summing the
# four total columns.
ddply(
  df,
  key_cols,
  summarise,
  total_prd = sum(total_prd),
  B_totalprod = sum(B_totalprod),
  p_totalprod = sum(p_totalprod),
  n_totalprod = sum(n_totalprod)
)
If your real dataset also has columns that contain "total" this should work:
library(tidyverse)
# Remove the date column first so it does not become a grouping key.
without_date <- select(df, -date)
# Group by every column whose name lacks "total", then sum all remaining
# columns; each summed column is returned with a "_sum" suffix.
without_date %>%
  group_by(.dots = str_subset(names(.), "total", negate = TRUE)) %>%
  summarise_all(list(sum = sum))
# A tibble: 2 x 11
# Groups: id, f_type, reg, hh_p10, internet, youngest [2]
id f_type reg hh_p10 internet youngest a_group total_prd_sum B_totalprod_sum p_totalprod_sum n_totalprod_sum
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 18100410-aa 4 6 2 1 5 3 775 308 52 415
2 18101080-ae 2 7 1 2 7 6 987 81 571 335
The line group_by(.dots = str_subset(names(.), "total", negate = TRUE)) means we group by all the columns in this dataset whose names do not contain the word "total".
I've made a few experiments, and each experiment led to the appearance of a color.
As I can't do more experiments, I want to draw samples of size 30 and see what frequency table (of colors) I would obtain over 1000 samples. The resulting frequency table should be the sum of the 1000 individual frequency tables.
I thought about concatenating the tables as follows and then aggregating, but it did not work:
mydata=structure(list(Date = structure(c(11L, 1L, 9L, 9L, 10L, 1L, 2L,
3L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L, 7L, 4L, 4L, 4L, 6L, 6L, 11L,
5L, 4L, 7L, 10L, 6L, 6L, 2L, 5L, 7L, 11L, 1L, 9L, 11L, 11L, 11L,
1L, 1L), .Label = c("01/02/2016", "02/02/2016", "03/02/2016",
"08/02/2016", "10/02/2016", "11/02/2016", "16/02/2016", "22/02/2016",
"26/01/2016", "27/01/2016", "28/01/2016"), class = "factor"),
Color = structure(c(30L, 33L, 11L, 1L, 18L, 18L, 11L,
16L, 19L, 19L, 22L, 1L, 18L, 18L, 13L, 14L, 13L, 18L, 24L,
24L, 11L, 24L, 2L, 33L, 25L, 1L, 30L, 5L, 24L, 18L, 13L,
35L, 19L, 19L, 18L, 23L, 19L, 8L, 19L, 14L), .Label = c("ARD",
"ARP", "BBB", "BIE", "CFX", "CHR", "DDD", "DOO", "EAU", "ELY",
"EPI", "ETR", "GEN", "GER", "GGG", "GIS", "ISE", "JUV", "LER",
"LES", "LON", "LYR", "MON", "NER", "NGY", "NOJ", "NYO", "ORI",
"PEO", "RAY", "RRR", "RSI", "SEI", "SEP", "VIL", "XQU", "YYY",
"ZYZ"), class = "factor"), Categorie = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1", "1,2", "1,2,3",
"1,3", "2", "2,3", "3", "4", "5"), class = "factor"), Portion_Longueur = c(3L,
4L, 1L, 1L, 2L, 4L, 5L, 6L, 7L, 7L, 8L, 8L, 9L, 8L, 8L, 9L,
11L, 7L, 7L, 7L, 9L, 8L, 3L, 8L, 7L, 11L, 2L, 9L, 8L, 5L,
8L, 12L, 3L, 4L, 1L, 3L, 3L, 3L, 4L, 5L)), .Names = c("Date",
"Color", "Categorie", "Portion_Longueur"), row.names = c(NA,
40L), class = "data.frame")
# Attempt to sum the color frequency tables of 1000 samples of size 30
# (this is the version reported as not producing the sum).
for (i in 1:1000) {
# Draw 30 colors without replacement and tabulate them.
mysamp= sample(mydata$Color,size=30)
x=data.frame(table(mysamp))
if (i==1) w=x
# NOTE: c() on data frames concatenates their columns into a plain list,
# so `w` ends up as a long list with many elements named `mysamp` and
# `Freq` rather than a data frame with more rows.
else w <- c(w, x)
}
# `w$Freq` and `w$mysamp` pick only the first element with each name, i.e.
# the first sample, so the "sum" covers a single sample (counts total 30).
aggregate(w$Freq, by=list(Color=w$mysamp), FUN=sum)
For example, for 3 samples (for (i in 1:3)), I expect to get the sum as follows:
But I do not get the sum; instead I get:
Color x
1 ARD 2
2 ARP 1
3 BBB 0
4 BIE 0
5 CFX 0
6 CHR 0
7 DDD 0
8 DOO 1
9 EAU 0
10 ELY 0
11 EPI 3
12 ETR 0
13 GEN 2
14 GER 2
15 GGG 0
16 GIS 1
17 ISE 0
18 JUV 4
19 LER 5
20 LES 0
21 LON 0
22 LYR 1
23 MON 1
24 NER 2
25 NGY 1
26 NOJ 0
27 NYO 0
28 ORI 0
29 PEO 0
30 RAY 1
31 RRR 0
32 RSI 0
33 SEI 2
34 SEP 0
35 VIL 1
36 XQU 0
37 YYY 0
38 ZYZ 0
How to do this ?
Thanks a lot
Your for loop is what's causing your issues. You end up creating a big list that is somewhat difficult to perform calculations on (check out names(w) to see what I mean). A better data structure would allow for easier calculations:
# Stack the per-sample frequency tables into one long data frame.
x <- NULL
for (draw in seq_len(1000)) {
  # 30 colors drawn without replacement; the vector keeps the name `mysamp`
  # so the tabulated column is called `mysamp` as well.
  mysamp <- sample(mydata$Color, size = 30)
  freq_tbl <- data.frame(table(mysamp))
  x <- rbind(x, freq_tbl)
}
# Total count of each color across all samples.
aggregate(Freq ~ mysamp, data = x, FUN = sum)
Note that this loop runs a bit slower than your loop. This is because of the rbind() function. See this post. Maybe someone will come along with a more efficient solution.