How to solve a barplot with multiple error lines using ggplot?

I am having some issues plotting this data. I tried to replicate the code from a linked example to process the data.
Treatment TIME N len sd se
1 M1 4 44,025 2,35990819 1,179954095
1 M1 4 43,45 2,653927907 1,326963953
1 M1 4 39,825 2,681262141 1,34063107
1 M1 4 43,975 5,341894171 2,670947085
1 M1 4 41,375 3,096637962 1,548318981
1 M1 4 43,425 2,547384279 1,27369214
1 M1 4 39,45 2,598076211 1,299038106
1 M1 4 41,05 3,511409973 1,755704987
1 M1 4 40,925 1,77270979 0,886354895
1 M1 4 40,075 3,237668915 1,618834457
1 M1 4 42,375 2,758471799 1,3792359
2 M1 4 40,975 4,560975773 2,280487886
2 M1 4 40,55 3,660145717 1,830072858
2 M1 4 36,975 3,358943286 1,679471643
2 M1 4 43,175 1,64797856 0,82398928
2 M1 4 36,45 5,453744891 2,726872445
2 M1 4 41,2 2,246478726 1,123239363
2 M1 4 42,7 4,48924641 2,244623205
2 M1 4 39,5 2,759226945 1,379613472
2 M1 4 44,375 6,335810919 3,167905459
2 M1 4 42,75 1,721433511 0,860716756
2 M1 4 40,85 1,707825128 0,853912564
3 M1 4 45,975 0,699404509 0,349702254
3 M1 4 44,2 3,03644529 1,518222645
3 M1 4 42,6 4,429446918 2,214723459
3 M1 4 45,55 5,269092268 2,634546134
3 M1 4 46,525 2,022168803 1,011084401
3 M1 4 45,675 3,597568623 1,798784312
3 M1 4 47,075 2,46221445 1,231107225
3 M1 4 47,3 0,783156008 0,391578004
3 M1 4 42,025 2,639917928 1,319958964
3 M1 4 49,05 5,382997926 2,691498963
3 M1 4 48,25 4,591659105 2,295829552
1 M2 4 216,5 5,066228051 2,533114026
1 M2 4 205,75 4,991659711 2,495829855
1 M2 4 210,75 11,8988795 5,94943975
1 M2 4 204,75 23,41473895 11,70736947
1 M2 4 198,75 6,396613687 3,198306844
1 M2 4 219,75 8,732124598 4,366062299
1 M2 4 195,75 16,56049516 8,280247581
1 M2 4 219,75 7,719024118 3,859512059
1 M2 4 197,5 5,259911279 2,62995564
1 M2 4 216,25 8,995369179 4,49768459
1 M2 4 212 12,4365054 6,218252702
2 M2 4 210,25 7,041543391 3,520771696
2 M2 4 214,25 16,31716887 8,158584436
2 M2 4 208,75 9,708243919 4,85412196
2 M2 4 220,75 16,17353806 8,086769029
2 M2 4 218 30,62678566 15,31339283
2 M2 4 234 40,02499219 20,0124961
2 M2 4 217,5 5,567764363 2,783882181
2 M2 4 214,25 12,28481447 6,142407237
2 M2 4 207 13,6381817 6,819090848
2 M2 4 210,25 8,578072822 4,289036411
2 M2 4 202,75 11,52894907 5,764474535
3 M2 4 98,75 19,92276755 9,961383773
3 M2 4 101,25 10,04572878 5,022864389
3 M2 4 96,75 14,43086969 7,215434845
3 M2 4 110,5 18,06469854 9,03234927
3 M2 4 102,25 4,031128874 2,015564437
3 M2 4 109 20,54263858 10,27131929
3 M2 4 114 14,49137675 7,245688373
3 M2 4 116,25 12,71154331 6,355771655
3 M2 4 90,75 61,74881915 30,87440958
3 M2 4 123,5 26,78930135 13,39465067
3 M2 4 132,75 27,54844218 13,77422109
1 M3 4 249,75 26,06881918 13,03440959
1 M3 4 268,75 21,8384218 10,9192109
1 M3 4 241,25 27,80137886 13,90068943
1 M3 4 232,25 26,107151 13,0535755
1 M3 4 271,5 20,63169083 10,31584542
1 M3 4 277,25 26,77529956 13,38764978
1 M3 4 242 12,75408431 6,377042157
1 M3 4 260 19,4422221 9,721111048
1 M3 4 256,25 23,8100119 11,90500595
1 M3 4 254 4,898979486 2,449489743
1 M3 4 250,25 13,72042273 6,860211367
2 M3 4 256,75 16,58061117 8,290305583
2 M3 4 264,5 26,71454036 13,35727018
2 M3 4 246,5 14,10673598 7,05336799
2 M3 4 266 17,64464036 8,822320179
2 M3 4 266,25 24,87803583 12,43901791
2 M3 4 266,75 16,17353806 8,086769029
2 M3 4 247,25 51,93184636 25,96592318
2 M3 4 258,25 37,93305507 18,96652753
2 M3 4 238,5 65,26612189 32,63306095
2 M3 4 260 19,8158186 9,907909298
2 M3 4 248,75 27,80137886 13,90068943
3 M3 4 108,75 26,65051594 13,32525797
3 M3 4 106,25 17,05627939 8,528139696
3 M3 4 109,25 17,93274472 8,966372362
3 M3 4 120,5 20,48576742 10,24288371
3 M3 4 107,25 2,5 1,25
3 M3 4 129 23,98610709 11,99305355
3 M3 4 131,5 15,75859554 7,879297769
3 M3 4 143,75 19,87251033 9,936255163
3 M3 4 117,75 80,267781 40,1338905
3 M3 4 139,5 36,24453982 18,12226991
3 M3 4 154,75 25,61737691 12,80868846
Here is the example to be reproduced:
alt2 <- structure(list(Treatment = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3), measurement = c("D", "D", "D", "D", "D",
"D", "D", "D", "D", "D", "F", "F", "F", "F", "F", "F", "F", "F",
"F", "F", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "F",
"F", "F", "F", "F", "F", "F", "F", "F", "F", "D", "D", "D", "D",
"D", "D", "D", "D", "D", "D", "F", "F", "F", "F", "F", "F", "F",
"F", "F", "F"), B1 = c(20.56, 19.7, 22.9, 21.1, 20.11, 22.98,
19.17, 21.67, 21.56, 20.56, 28.91, 28.01, 28.45, 29.23, 28.34,
28.1, 29.03, 28.22, 29.36, 29.87, 22.56, 21.48, 17.63, 20.78,
24.79, 25, 24.67, 23.51, 19.47, 22.85, 27.98, 28.1, 28.2, 28.22,
28.15, 28.97, 29.43, 29.05, 29.37, 29.39, 25.3, 24.56, 22.76,
23.47, 22.73, 24.98, 20.56, 27.1, 25.87, 23.46, 29.03, 29.67,
29.56, 28.69, 28.93, 29.01, 29.73, 29.77, 28.79, 28.83), B2 = c(19.78,
20.98, 22.27, 21.68, 21.56, 24.86, 23.45, 24.61, 23.56, 21.46,
28.56, 28.74, 28.37, 29.04, 29.85, 28.15, 27.99, 29.88, 28.74,
28.57, 21.47, 20.48, 25.12, 21.13, 22.76, 18.48, 22.76, 23.91,
17.27, 24.26, 28.64, 28.73, 28.47, 28.38, 28.26, 28.88, 29.06,
29.28, 29.59, 29.64, 21.45, 22.56, 27.45, 23.11, 20.03, 20.9,
21.1, 25.02, 24.16, 22.71, 28.54, 27.09, 29.03, 29.47, 29.58,
29.38, 28.05, 29.74, 28.5, 27.3), B3 = c(20.24, 16.42, 23.51,
22.41, 21.63, 24.61, 24.11, 23.57, 18.31, 19.61, 28.27, 29.07,
26.98, 29.33, 28.19, 28.54, 29.08, 29.7, 29.59, 29.58, 17.9,
21.45, 20.56, 22.74, 23.59, 20.01, 21.17, 22.11, 24.14, 23.35,
28.16, 28.38, 28.47, 28.94, 28.46, 27.47, 26.45, 28.49, 29.05,
29.79, 20.98, 26.93, 20.75, 19.63, 24.72, 24.07, 17.26, 25.66,
21.23, 21.78, 29.79, 29.64, 29.57, 29.32, 29.48, 29.77, 29.05,
29.11, 28.97, 29.59), B4 = c(25.61, 20.12, 19.42, 22.67, 24.31,
23.12, 18.24, 17.24, 21.58, 22.48, 29.01, 29.7, 28.77, 28.59,
28.74, 28.49, 28.08, 28.39, 28.4, 28.67, 19.02, 18.65, 20.72,
21.61, 20.41, 22.01, 23.71, 20.05, 22.13, 20.1, 28.46, 28.47,
28.38, 29.06, 28.48, 28.73, 27.9, 29.59, 29.4, 28.38, 24.31,
19.09, 24.89, 24.64, 21.47, 25.04, 22.51, 21.1, 20.27, 23.64,
28.57, 28.08, 29.19, 29.61, 29.84, 28.07, 29.18, 29.59, 29.58,
28.22), N = c(4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-60L))
The code used is this:
library(ggplot2)

ggplot(alt2, aes(x = Treatment)) +
  geom_bar(aes(y = len, fill = TIME),
           stat = "identity", alpha = 0.5,
           position = position_dodge()) +
  scale_fill_manual(values = c("grey", "black", "blue")) +
  geom_errorbar(aes(ymin = len - se, ymax = len + se, group = TIME),
                width = 0.2, colour = "black",
                position = position_dodge(0.9)) +
  theme_light() +
  xlab("doses") +
  ylab("len")
I don't understand why I get that figure. I am working with 3 doses (1, 2, 3) at different times (M1, M2, M3), but I am getting a figure with multiple error lines. How can I solve this?
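A likely explanation (my reading of the posted data, not something stated in the post): with stat = "identity", geom_bar() and geom_errorbar() draw one bar and one error bar for every row of the data, and the table above has about eleven rows per Treatment/TIME combination, hence the many error lines. A minimal sketch, assuming the posted table is loaded as a data frame called dat (a hypothetical name) and that one bar per dose and time is wanted, would collapse the table first; the error bar shown here is one common choice (the standard error of the per-replicate means):

library(dplyr)
library(ggplot2)

# dat is assumed to hold the posted table (Treatment, TIME, N, len, sd, se)
plot_dat <- dat %>%
  group_by(Treatment, TIME) %>%
  summarise(mean_len = mean(len),
            se = sd(len) / sqrt(n()),   # one common choice of error bar
            .groups = "drop")

ggplot(plot_dat, aes(x = factor(Treatment), y = mean_len, fill = TIME)) +
  geom_col(alpha = 0.5, position = position_dodge()) +
  geom_errorbar(aes(ymin = mean_len - se, ymax = mean_len + se),
                width = 0.2, colour = "black",
                position = position_dodge(0.9)) +
  scale_fill_manual(values = c("grey", "black", "blue")) +
  theme_light() +
  xlab("doses") +
  ylab("len")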

Related

Summary of all possible combinations of variables [closed]

I have a data frame with individual-level data on people in a country. It contains each person's county or municipality of residence, sex, age, race, and cancer status. I want to aggregate the data into a new data frame ordered by county and stratified by age (in categories), sex, and race; that is, to create subgroups defined by combinations of these variables. The original data has a structure similar to the fictitious data below.
structure(list(Person_ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40), County_ID = c(1,
1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6), Age = c(39,
21, 65, 87, 19, 16, 48, 52, 31, 19, 24, 44, 38,
39, 40, 27, 69, 71, 52, 53, 80, 23,
21, 29, 38, 34, 39, 73, 54, 50, 52,
43, 55, 57, 37, 24, 44, 37, 38,
40), Sex = c("F", "F", "F", "M", "M", "M", "F",
"M", "M", "F", "F", "F", "M", "M", "F", "F", "M", "M", "M", "M",
"M", "F", "F", "F", "M", "F", "F", "M", "M", "M", "F", "F", "F",
"F", "F", "F", "F", "F", "M", "M"), Race = c(1, 2, 1, 2, 3, 3,
3, 1, 1, 2, 2, 1, 2, 1, 2, 3, 3, 3, 2, 1, 2, 2, 3, 1, 3, 2, 3,
1, 2, 3, 3, 1, 2, 2, 2, 3, 1, 1, 2, 2), `Cancer-status` = c(0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0)), row.names = c(NA,
-40L), class = c("tbl_df", "tbl", "data.frame"))
with a structure like
Person_ID  County_ID  Age  Sex  Race  Cancer_status
1          1          30   M    1     1
2          1          41   M    2     0
3          1          19   F    1     0
4          1          37   F    3     1
5          2          28   F    3     0
6          3          65   M    1     1
where Cancer_status is a dummy or binary variable and Race is a factor variable.
And I want a new data frame in the format below (similar to the data structure of pennLC$data in the SpatialEpi package), with the counts of cancer and population ordered by county and sorted by the 3 strata (race, sex and age). The new age variable is a factor (categorical) variable.
county  cancer  pop_county  race  Sex  age
1       0       1492        1     F    Under 40
1       0       365         1     F    40-59
1       1       68          1     F    60-69
1       0       73          1     F    70+
1       0       23351       2     F    Under 40
1       5       12136       2     F    40-59
Thank you,
I'm assuming you want dplyr. Given your sample data, try this:
library(dplyr)

DF %>%
  mutate(Age = cut(Age, c(0, 40, 60, 70, Inf), right = FALSE)) %>%
  group_by(County_ID, Race, Sex, Age) %>%
  summarize(cancer = sum(`Cancer-status`), pop_county = n()) %>%
  ungroup()
# # A tibble: 37 x 6
# County_ID Race Sex Age cancer pop_county
# <dbl> <dbl> <chr> <fct> <dbl> <int>
# 1 1 1 F [0,40) 0 1
# 2 1 1 F [60,70) 0 1
# 3 1 2 F [0,40) 0 1
# 4 1 2 M [70,Inf) 0 1
# 5 1 3 M [0,40) 0 1
# 6 2 1 M [0,40) 1 1
# 7 2 1 M [40,60) 0 1
# 8 2 2 F [0,40) 1 2
# 9 2 3 F [40,60) 0 1
# 10 2 3 M [0,40) 0 1
# # ... with 27 more rows
You'll need to relabel the Age factor.
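For instance (a small sketch, borrowing the labels from the desired output shown in the question), the labels can be supplied to cut() directly:

library(dplyr)

DF %>%
  mutate(Age = cut(Age, c(0, 40, 60, 70, Inf), right = FALSE,
                   labels = c("Under 40", "40-59", "60-69", "70+"))) %>%
  group_by(County_ID, Race, Sex, Age) %>%
  summarize(cancer = sum(`Cancer-status`), pop_county = n()) %>%
  ungroup()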

map over and arrange a list based on the list names

I have a list which looks like:
List of 8
$ 9 :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 40 obs. of 2 variables:
..$ date: Date[1:40], format: "2014-03-22" "2019-03-18" "2018-04-28" ...
..$ .id : num [1:40] 9 9 9 9 9 9 9 9 9 9 ...
$ c(1, 7) :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 40 obs. of 2 variables:
..$ date: Date[1:40], format: "2004-08-26" "2012-10-21" "2007-03-10" ...
..$ .id : num [1:40] 7 7 1 7 7 7 7 1 7 7 ...
$ c(13, 18) :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 40 obs. of 2 variables:
..$ date: Date[1:40], format: "2016-01-31" "2016-03-24" "2018-10-17" ...
..$ .id : num [1:40] 13 13 13 18 13 18 13 13 13 13 ...
$ c(18, 2, 7, 13):Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 40 obs. of 2 variables:
..$ date: Date[1:40], format: "2013-04-05" "2019-04-23" "2005-03-05" ...
..$ .id : num [1:40] 13 2 7 2 2 13 13 7 13 7 ...
$ c(19, 5) :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 40 obs. of 2 variables:
..$ date: Date[1:40], format: "2018-04-10" "2016-08-03" "2012-05-18" ...
..$ .id : num [1:40] 5 19 5 5 5 5 5 5 19 5 ...
$ c(2, 7, 18) :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 40 obs. of 2 variables:
..$ date: Date[1:40], format: "2018-02-01" "2011-03-08" "2009-09-29" ...
..$ .id : num [1:40] 7 7 2 18 2 18 2 2 7 2 ...
$ c(5, 19) :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 40 obs. of 2 variables:
..$ date: Date[1:40], format: "2011-05-14" "2005-08-31" "2015-07-06" ...
..$ .id : num [1:40] 19 5 5 5 5 19 5 5 5 5 ...
$ c(7, 1, 2, 18) :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 40 obs. of 2 variables:
..$ date: Date[1:40], format: "2003-04-12" "2014-12-03" "2001-02-21" ...
..$ .id : num [1:40] 7 1 1 7 2 1 1 18 2 1 ...
The names of the list are the following:
9
c(1, 7)
c(13, 18)
c(18, 2, 7, 13)
c(19, 5)
c(2, 7, 18)
c(5, 19)
c(7, 1, 2, 18)
Two of the list elements look like:
$`c(19, 5)`
# A tibble: 40 x 2
date .id
<date> <dbl>
1 2018-04-10 5
2 2016-08-03 19
3 2012-05-18 5
4 2007-09-11 5
5 2011-11-03 5
6 2007-04-09 5
7 2001-07-12 5
8 2018-07-30 5
9 2013-07-30 19
10 2001-08-13 5
# ... with 30 more rows
$`c(2, 7, 18)`
# A tibble: 40 x 2
date .id
<date> <dbl>
1 2018-02-01 7
2 2011-03-08 7
3 2009-09-29 2
4 2014-07-30 18
5 2004-04-17 2
6 2016-11-21 18
7 2007-10-27 2
8 2009-02-08 2
9 2016-01-18 7
10 2010-09-27 2
# ... with 30 more rows
What I would like to do is arrange each tibble by the .id and date columns, with .id ordered according to the order in which it appears in the corresponding list name. So for the c(19, 5) element, the rows with .id 19 would come first (ordered by date) and the rows with .id 5 second (also ordered by date). For the c(5, 19) element, 5 would come first (ordered by date) and 19 second (ordered by date).
Any advice on how to do this would be great.
Data:
lst <- list(`9` = structure(list(date = structure(c(16151, 17973, 17649,
17738, 17388, 13927, 11594, 13095, 15312, 12030, 13805, 13240,
15660, 15926, 11645, 12139, 17853, 15328, 12561, 13595, 14147,
12142, 14112, 14083, 16057, 13074, 11458, 14735, 12892, 16139,
11935, 17666, 14789, 12231, 12343, 17012, 13099, 17682, 15150,
14195), class = "Date"), .id = c(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9)), row.names = c(NA, -40L), class = c("tbl_df",
"tbl", "data.frame")), `c(1, 7)` = structure(list(date = structure(c(12656,
15634, 13582, 17498, 15079, 12265, 18031, 17399, 11603, 13886,
16876, 16022, 16303, 17776, 12717, 15154, 12950, 13693, 17561,
16963, 15690, 12581, 14883, 18010, 14280, 12672, 16108, 14347,
14326, 14628, 17913, 13771, 15369, 14765, 12067, 16397, 11555,
14855, 16308, 12824), class = "Date"), .id = c(7, 7, 1, 7, 7,
7, 7, 1, 7, 7, 1, 1, 7, 1, 7, 1, 1, 1, 7, 1, 7, 1, 1, 1, 1, 1,
1, 7, 7, 1, 7, 7, 7, 7, 1, 1, 7, 7, 1, 1)), row.names = c(NA,
-40L), class = c("tbl_df", "tbl", "data.frame")), `c(13, 18)` = structure(list(
date = structure(c(16831, 16884, 17821, 15686, 14680, 16428,
17462, 15693, 14707, 16889, 17534, 17556, 15243, 17308, 16886,
17212, 15199, 15669, 17761, 17103, 16992, 17396, 17584, 15904,
15643, 16748, 17554, 16822, 17184, 16264, 15425, 16715, 15268,
15205, 14772, 17285, 17184, 16112, 15327, 17100), class = "Date"),
.id = c(13, 13, 13, 18, 13, 18, 13, 13, 13, 13, 13, 13, 18,
13, 18, 13, 13, 13, 18, 18, 13, 13, 13, 13, 18, 18, 13, 13,
13, 18, 13, 13, 13, 13, 13, 13, 18, 18, 18, 13)), row.names = c(NA,
-40L), class = c("tbl_df", "tbl", "data.frame")), `c(18, 2, 7, 13)` = structure(list(
date = structure(c(15800, 18009, 12847, 12378, 12365, 14864,
14961, 14562, 15723, 15856, 11545, 11755, 15080, 13149, 12655,
14898, 13067, 14375, 15499, 16681, 15682, 18030, 15732, 14452,
17624, 15741, 17894, 12768, 17295, 12015, 16533, 13589, 17072,
14678, 14067, 14348, 16846, 18125, 17826, 16874), class = "Date"),
.id = c(13, 2, 7, 2, 2, 13, 13, 7, 13, 7, 7, 7, 7, 2, 7,
7, 7, 7, 7, 18, 13, 13, 18, 7, 2, 7, 7, 7, 13, 2, 2, 2, 7,
18, 7, 2, 2, 18, 13, 18)), row.names = c(NA, -40L), class = c("tbl_df",
"tbl", "data.frame")), `c(19, 5)` = structure(list(date = structure(c(17631,
17016, 15478, 13767, 15281, 13612, 11515, 17742, 15916, 11547,
12959, 16713, 12521, 12457, 12174, 18054, 16407, 13462, 14704,
16642, 12551, 16289, 12034, 17676, 16486, 15009, 17220, 16753,
13335, 12498, 12697, 17725, 17833, 16329, 17182, 16435, 11475,
14732, 15210, 17823), class = "Date"), .id = c(5, 19, 5, 5, 5,
5, 5, 5, 19, 5, 5, 19, 5, 5, 5, 19, 5, 5, 5, 5, 5, 5, 5, 5, 19,
5, 5, 5, 5, 5, 5, 19, 19, 19, 19, 5, 5, 19, 5, 5)), row.names = c(NA,
-40L), class = c("tbl_df", "tbl", "data.frame")), `c(2, 7, 18)` = structure(list(
date = structure(c(17563, 15041, 14516, 16281, 12525, 17126,
13813, 14283, 16818, 14879, 15860, 16616, 17303, 15356, 14899,
14306, 15254, 17836, 12555, 15367, 17721, 16216, 16787, 16603,
14723, 13608, 13276, 17852, 16922, 17774, 14676, 16696, 17059,
15518, 13829, 14623, 17787, 14534, 17579, 15137), class = "Date"),
.id = c(7, 7, 2, 18, 2, 18, 2, 2, 7, 2, 7, 7, 18, 7, 7, 7,
7, 18, 7, 2, 7, 2, 7, 2, 2, 7, 2, 18, 18, 2, 18, 18, 2, 2,
7, 2, 7, 2, 2, 7)), row.names = c(NA, -40L), class = c("tbl_df",
"tbl", "data.frame")), `c(5, 19)` = structure(list(date = structure(c(15108,
13026, 16622, 12813, 11591, 15364, 16033, 16594, 15353, 14652,
14697, 17160, 17084, 16686, 13560, 11401, 16433, 11722, 17606,
15924, 16235, 17817, 16172, 14612, 12021, 17276, 18080, 16222,
16849, 14746, 14036, 17850, 11350, 15036, 15577, 14833, 16464,
15322, 15988, 17023), class = "Date"), .id = c(19, 5, 5, 5, 5,
19, 5, 5, 5, 5, 19, 19, 19, 19, 5, 5, 19, 5, 19, 5, 19, 19, 5,
19, 5, 19, 5, 19, 19, 19, 5, 19, 5, 19, 5, 19, 5, 5, 19, 19)), row.names = c(NA,
-40L), class = c("tbl_df", "tbl", "data.frame")), `c(7, 1, 2, 18)` = structure(list(
date = structure(c(12154, 16407, 11374, 12594, 13229, 13812,
12462, 16255, 16181, 15333, 15337, 16019, 14551, 16383, 13281,
15422, 12951, 17836, 16740, 12130, 18142, 16458, 18148, 15173,
12506, 15581, 15244, 16519, 15785, 17916, 17575, 15128, 15274,
15808, 12137, 16425, 15927, 14696, 12771, 12894), class = "Date"),
.id = c(7, 1, 1, 7, 2, 1, 1, 18, 2, 1, 2, 2, 1, 7, 7, 1,
1, 18, 2, 2, 2, 1, 18, 2, 1, 1, 7, 18, 7, 18, 2, 18, 1, 7,
2, 1, 7, 2, 2, 2)), row.names = c(NA, -40L), class = c("tbl_df",
"tbl", "data.frame")))
You could do something like the following:
# Loop over names of list
newlist <- lapply(names(lst), function(i) {
  # Subset list by name
  thislist <- lst[[i]]
  # Evaluate the list name to recover the desired .id order
  i <- eval(parse(text = i))
  # Order rows by .id according to that order
  thislist[order(factor(thislist$.id, levels = as.character(i))), ]
})
We can use imap from purrr along with match and order to order each data frame:
purrr::imap(lst, ~.x[order(match(.x$.id, eval(parse(text = .y)))), ])
#$`9`
# A tibble: 40 x 2
# date .id
# <date> <dbl>
# 1 2014-03-22 9
# 2 2019-03-18 9
# 3 2018-04-28 9
# 4 2018-07-26 9
# 5 2017-08-10 9
# 6 2008-02-18 9
# 7 2001-09-29 9
# 8 2005-11-08 9
# 9 2011-12-04 9
#10 2002-12-09 9
# … with 30 more rows
#$`c(1, 7)`
# A tibble: 40 x 2
# date .id
# <date> <dbl>
# 1 2007-03-10 1
# 2 2017-08-21 1
# 3 2016-03-16 1
# 4 2013-11-13 1
# 5 2018-09-02 1
# 6 2011-06-29 1
# 7 2005-06-16 1
# 8 2007-06-29 1
# 9 2016-06-11 1
#10 2004-06-12 1
# … with 30 more rows
#....
#.....
In base R, the same can be achieved using Map:
Map(function(x, y) x[order(match(x$.id, y)), ], lst,
lapply(names(lst), function(x) eval(parse(text = x))))
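If the rows should also be sorted by date within each .id block (the question asks for both), order() accepts extra tie-breaking keys; a sketch along the lines of the imap approach above:

library(purrr)

imap(lst, function(x, nm) {
  key <- eval(parse(text = nm))            # list name -> desired .id order
  x[order(match(x$.id, key), x$date), ]    # first by .id order, then by date
})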

Get position indices after min / max aggregation on matrix

Sequel to my first problem here (How to aggregate hourly values into 24h-average means without timestamp).
Now I want to calculate the max (and min) of my time series for each 12-hour interval.
I have my hourly measurements (data_measure) and converted them into a time series of half-days.
t_measure <- ts(data = data_measure, frequency = 12)
Then I used the aggregate function from {stats}:
data_measure_daily_max <- aggregate(t_measure, 1, max)
data_measure <- structure(c(8.29, 7.96, 8.14, 7.27, 7.37, 7.3, 7.23, 7.53,
7.98, 10.2, 12.39, 14.34, 14.87, 14.39, 12.54, 11.84, 10.3, 10.62,
10.65, 10.56, 10.43, 10.35, 9.85, 9.12, 8.95, 8.82, 8.92, 9.33,
9.44, 9.3, 9.15, 9.37, 9.54, 10.24, 12.13, 12.43, 12.65, 13,
13.18, 13.58, 13.64, 13.75, 13.85, 13.94, 13.79, 13.84, 13.94,
14.26, 24.93, 24.64, 23.67, 21.46, 21.33, 20.83, 21.12, 21.1,
23.75, 25.39, 30.72, 30.71, 30.81, 30.92, 32.61, 32.37, 32.49,
30.68, 30.23, 30.45, 28.1, 26.9, 25.09, 25.07, 24.59, 24.22,
23.05, 22.21, 22.07, 21.6, 21.24, 21.22, 21.85, 24.87, 28.85,
29.42, 30.82, 30.97, 31.32, 30.81, 30.83, 29.9, 30.01, 30.31,
30, 27.91, 25.78, 25.88, 8.78, 8.47, 8.49, 7.65, 8.63, 9.02,
9.02, 8.11, 7.63, 9.19, 11.25, 12.24, 13.62, 12.09, 10.6, 11.1,
10.16, 10.44, 9.58, 10.04, 10.01, 10.23, 9.51, 9.2, 9.34, 9.6,
9.4, 9.45, 9.36, 9.26, 9.3, 9.46, 9.58, 9.89, 10.6, 11.04, 12.1,
12.61, 13.12, 13.47, 13.55, 13.51, 13.63, 13.84, 13.93, 14.17,
13.97, 13.86), .Dim = c(48L, 3L), .Dimnames = list(NULL, c("station1",
"station2", "station3")))
So what I actually need is an index/vector that tells me where the max and min of each time interval are, so that later I can extract exactly those positions from other data sets for comparison.
My first trial:
max_index <- which(aggregate(t_measure, 1, max)) # argument to 'which' is not logical
Use which.max and which.min with aggregate:
a1 <- aggregate(t_measure, 1, which.min)
a2 <- aggregate(t_measure, 1, which.max)
a1
#Time Series:
#Start = 1
#End = 4
#Frequency = 1
# station1 station2 station3
#1 7 6 9
#2 12 12 12
#3 2 8 6
#4 1 11 1
a2
#Time Series:
#Start = 1
#End = 4
#Frequency = 1
# station1 station2 station3
#1 12 11 12
#2 1 3 1
#3 12 12 12
#4 12 3 10
If you want the index of the min with reference to the original data_measure data, we can do:
vals <- nrow(t_measure)/12
index_min <- a1 + (12 * (seq_len(vals) - 1))
index_min
#Time Series:
#Start = 1
#End = 4
#Frequency = 1
# station1 station2 station3
#1 7 6 9
#2 24 24 24
#3 26 32 30
#4 37 47 37
This can be read as: for station1, in the first 12-hour interval the min value is in row 7 of data_measure; for the next 12-hour interval it is in row 24, and similarly for the other stations.
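The positions of the maxima (a2) can be mapped back to rows of data_measure in the same way (a sketch, assuming as above that the series divides evenly into 12-hour blocks):

index_max <- a2 + (12 * (seq_len(vals) - 1))

# e.g. pull the station1 maxima out of the original matrix
data_measure[as.vector(index_max[, "station1"]), "station1"]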

Impute missing values with average of previous 13 values

I have a dataset with a few missing observations. My objective is to impute the missing values in each variable with the average of the previous 13 values. If there is a missing value before the 13th observation, the average of whatever values come before it should be used instead. I am not sure how to do this.
Please use the below to replicate my dataset. Your help is much appreciated.
df1 <- structure(list(V1 = c(276.12, 53.4, 20.64, 181.8, 216.96, 10.44,
69, 144.24, 10.32, 239.76, 79.32, 257.64, 28.56, 117, 244.92,
234.48, NA, 337.68, 83.04, 176.76, 262.08, 284.88, 15.84, NA,
74.76, 315.48, 171.48, 288.12, 298.56, 84.72, 351.48, 135.48,
NA, 318.72, 114.84, 348.84, 320.28, 89.64, 51.72, 273.6, 243,
212.4, 352.32, 248.28, NA, 210.12, 107.64, 287.88, 272.64, 80.28,
239.76, 120.48, 259.68, 219.12, 315.24, 238.68, 8.76, 163.44,
252.96), V2 = c(45.36, 47.16, 55.08, 49.56, 12.96, 58.68, 39.36,
NA, 2.52, 3.12, 6.96, 28.8, NA, 9.12, 39.48, 57.24, 43.92, 47.52,
24.6, 28.68, 33.24, 6.12, 19.08, 20.28, 15.12, 4.2, 35.16, NA,
32.52, 19.2, 33.96, 20.88, 1.8, 24, 1.68, NA, 52.56, 59.28, 32.04,
45.24, 26.76, 40.08, 33.24, 10.08, 30.84, 27, 11.88, 49.8, 18.96,
14.04, 3.72, 11.52, 50.04, 55.44, 34.56, NA, 33.72, 23.04, 59.52
)), class = "data.frame", row.names = c(NA, -59L))
You can use zoo::rollapply to compute a rolling mean over windows of 13 values, then shift it by one row so each position gets the mean of the previous values:
mean13 = zoo::rollapply(
  df1$V1,
  13,
  function(x) {
    mean(na.omit(x))
  },
  align = "right",
  fill = NA,
  partial = TRUE
)

df1$V1_prev_mean = c(df1$V1[1], head(mean13, -1))
df1$V1 = ifelse(is.na(df1$V1), df1$V1_prev_mean, df1$V1)
Output:
V1 V2 V1_prev_mean
1 276.1200 45.36 276.1200
2 53.4000 47.16 276.1200
3 20.6400 55.08 164.7600
4 181.8000 49.56 116.7200
5 216.9600 12.96 132.9900
6 10.4400 58.68 149.7840
7 69.0000 39.36 126.5600
8 144.2400 NA 118.3371
9 10.3200 2.52 121.5750
10 239.7600 3.12 109.2133
11 79.3200 6.96 122.2680
12 257.6400 28.80 118.3636
13 28.5600 NA 129.9700
14 117.0000 9.12 122.1692
15 244.9200 39.48 109.9292
16 234.4800 57.24 124.6615
17 141.1108 43.92 141.1108 # <- this row filled
18 337.6800 47.52 137.7200
19 83.0400 24.60 147.7800
20 176.7600 28.68 153.8300
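To treat every column the same way (a sketch, assuming the same previous-13 rule should apply to V2 as well, and starting from the original df1 before the helper column was added), the steps above can be wrapped in a small function and applied with lapply:

impute_prev13 <- function(x) {
  # rolling mean of up to 13 values, ignoring NAs, anchored on the right
  m <- zoo::rollapply(x, 13, function(v) mean(na.omit(v)),
                      align = "right", fill = NA, partial = TRUE)
  # shift by one so each position sees the mean of the *previous* values
  prev_mean <- c(x[1], head(m, -1))
  ifelse(is.na(x), prev_mean, x)
}

df1[] <- lapply(df1, impute_prev13)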

How to set to 0 all values that appear less than k times in variables within a nested df

library(tidyverse)
ex <- structure(list(group = c("Group A", "Group B", "Group C"), data = list(
structure(list(a = c(25.1, 15.1, 28.7, 29.7, 5.3, 3.4, 5.3,
10.1, 2.4, 18, 4.7, 22.1, 9.5, 3.1, 26.5, 5.1, 24, 22.5,
19.4, 22.9, 24.5, 18.2, 7.9, 5.3, 24.7), b = c(95.1, 51,
100, 94.1, 47.3, 0, 50.7, 45.8, 40.7, 49.4, 51.9, 76.4, 26.7,
19.8, 37.4, 59.4, 59.1, 60.2, 26.1, 2.8, 100, 40.7, 56.4,
42.5, 0), c = c(39.9, 42.7, 16.3, 11.1, 56.9, 17.8, 62, 28.1,
43, 44.8, 54.8, 8.7, 5.5, 40.2, 7.7, 60.7, 24.8, 7.5, 3.5,
16.9, 31.6, 45.8, 76.7, 58.6, 15.8), d = c(-2.39999999999999,
28.6, -4.59999999999999, -1.39999999999999, 10.3, 3.1, 23.4,
-43, -36.3, 32.4, 33.1, 9.8, 1.5, -17.6, 16.6, 20.9, 7.8,
-1.7, -23.3, 0, -15, 59.3, -40.2, 46.9, 4.7)), .Names = c("a",
"b", "c", "d"), row.names = c(NA, -25L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(a = c(5, 4.7, 30.3,
14.3, 31.6, 6, 4.9, 23.3, 26.9, 16.9, 27.2, 23.8, 19.9, 28.6,
9.9, 17.4, 14.3, 12.5, 30.4, 30.3, 30, 6, 18, 23.7, 5.1),
b = c(48.9, 41.3, 20.1, 63.7, 85.1, 30.3, 52.8, 49.7,
27.1, 51.6, 21.8, 52.4, 52.5, 59.6, 13.7, 53.1, 69, 66.9,
23.4, 35.4, 45.8, 23.7, 62.9, 90.3, 59.6), c = c(37.4,
18.5, 64.6, 13.5, 7.8, 6.8, 12.7, 8.5, 7.8, 5.4, 14.1,
20.5, 10.9, 10.5, 7.5, 14.7, 6.9, 0.699999999999999,
4.7, 1.9, 11.9, 0.9, 7.2, 9.2, 42.2), d = c(4.9, -3.7,
13.5, 21.9, -2.69999999999999, 6.6, 0.5, -12.3, 38.7,
-25.8, -18, 28.4, 38.3, -3.6, 39.4, 19, 23.4, -38.7,
17, 36.3, -31.7, -9.3, -10.5, 9.7, -10.6)), .Names = c("a",
"b", "c", "d"), row.names = c(NA, -25L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(a = c(29.9, 12.8, 23.9,
26.2, 27.5, 32.6, 33.2, 24.8, 29, 22.6, 4.7, 25.6, 4.7, 13.1,
25.9, 14.5, 23.5, 26.6, 12.8, 24.1, 9.1, 31.9, 24.8, 4.6,
17.9), b = c(63.7, 23.3, 71.2, 46.7, 30.6, 49.3, 14.6, 68.4,
27.9, 49.1, 60.5, 26.4, 56.9, 55.4, 37.9, 40.7, 32.7, 68.5,
42.7, 27.9, 67.5, 43.4, 76.6, 53.3, 26.8), c = c(1.6, 32,
18.6, 14, 0.5, 7.2, 27.3, 8.9, 11, 15.5, 16.7, 16.4, 63.1,
14.7, 6.8, 9, 3.1, 11.7, 11, 11.5, 10.6, 14.9, 7.1, 13.2,
5.1), d = c(-35.4, 21, 12, 1.8, 37.6, 9.2, 17.6, 0, -19.4,
32.6, -32, -3.6, 7.2, -25.7, 9.1, -8, 35.8, 24.8, -13.9,
-21.7, -28.7, 0.200000000000003, -16.9, -26.5, 26.2)), .Names = c("a",
"b", "c", "d"), row.names = c(NA, -25L), class = c("tbl_df",
"tbl", "data.frame"))), h_candidates = list(structure(c(0.17320508075689, 2.37782856461527, 2.94890646051978, 3.35205778704499, 3.66771041547043, 3.95224618679369), .Names = c("0%", "0.01%", "0.02%", "0.03%", "0.04%", "0.05%")), structure(c(0.316227766016836, 2.63452963884554, 3.2327619513522, 3.63593179253957, 3.97743636027027, 4.22137418384109), .Names = c("0%", "0.01%", "0.02%", "0.03%", "0.04%", "0.05%")), structure(c(0.316227766016837, 2.7258026340878, 3.24807635378234, 3.62353418639869, 3.92683078321437, 4.17731971484109), .Names = c("0%", "0.01%", "0.02%", "0.03%", "0.04%", "0.05%"))), assignment = list(
structure(list(`0%` = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25),
`0.01%` = c(1, 2, 3, 3, 4, 5, 4, 6, 7, 8, 9, 10, 11,
12, 13, 4, 14, 15, 16, 17, 18, 19, 20, 21, 17), `0.02%` = c(1,
2, 3, 3, 4, 5, 4, 6, 7, 8, 9, 10, 11, 12, 13, 4, 14,
15, 16, 17, 18, 19, 20, 21, 17), `0.03%` = c(1, 2, 3,
3, 4, 5, 4, 6, 7, 8, 9, 10, 11, 12, 13, 4, 10, 14, 15,
16, 17, 18, 19, 9, 16), `0.04%` = c(1, 2, 3, 4, 5, 6,
5, 7, 8, 9, 10, 11, 12, 13, 14, 5, 11, 15, 16, 17, 18,
19, 20, 10, 17)), .Names = c("0%", "0.01%", "0.02%",
"0.03%", "0.04%"), row.names = c(NA, -25L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(`0%` = c(1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 25), `0.01%` = c(1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 4, 17, 18, 19, 20, 21, 22,
23, 24), `0.02%` = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 13, 4, 16, 17, 9, 18, 19, 14, 20, 21), `0.03%` = c(1,
2, 3, 4, 5, 6, 2, 7, 8, 9, 10, 11, 12, 13, 14, 12, 4, 15,
6, 8, 16, 17, 13, 18, 19), `0.04%` = c(1, 2, 3, 4, 5, 6,
2, 7, 8, 9, 10, 11, 12, 13, 14, 12, 4, 15, 6, 8, 7, 16, 13,
17, 1)), .Names = c("0%", "0.01%", "0.02%", "0.03%", "0.04%"
), row.names = c(NA, -25L), class = c("tbl_df", "tbl", "data.frame"
)), structure(list(`0%` = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
), `0.01%` = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 12, 15, 16, 17, 15, 18, 19, 4, 20, 21, 22), `0.02%` = c(1,
2, 3, 4, 5, 6, 7, 8, 9, 5, 10, 11, 12, 13, 11, 14, 5, 15,
14, 16, 17, 18, 8, 19, 20), `0.03%` = c(1, 2, 3, 4, 5, 6,
7, 3, 8, 9, 10, 11, 12, 10, 11, 13, 5, 14, 13, 8, 10, 4,
3, 13, 6), `0.04%` = c(1, 2, 3, 4, 5, 5, 6, 3, 7, 8, 9, 10,
11, 9, 10, 12, 5, 13, 12, 7, 9, 4, 3, 12, 5)), .Names = c("0%",
"0.01%", "0.02%", "0.03%", "0.04%"), row.names = c(NA, -25L
), class = c("tbl_df", "tbl", "data.frame")))), .Names = c("group", "data", "h_candidates", "assignment"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -3L))
With the data structured as above, I would like to change all values within the assignment data.frames that appear less than k times (let's say k = 5) in a column.
So I need a solution that takes each data.frame in turn, then each column within a data.frame, checks which values appear less than 5 times in that column and, if there are any, replaces them with 0.
Ideally, the solution would use tidyverse functions. I think nested purrr::map as well as dplyr::mutate are needed here, but I don't know how to count occurrences within a column and then replace the values.
You can use purrr::map() to loop over the list column with the dataframes, and then purrr::modify() to loop over each column in each dataframe. Then it's just a matter of defining a function that counts occurrences of values in a vector, and replaces them if the count is less than k:
library(tidyverse)

ex %>%
  mutate(assignment = map(assignment, modify, function(x, k) {
    n <- table(x)[as.character(x)]
    replace(x, n < k, 0)
  }, k = 5))
#> # A tibble: 3 x 4
#> group data h_candidates assignment
#> <chr> <list> <list> <list>
#> 1 Group A <tibble [25 x 4]> <dbl [6]> <tibble [25 x 5]>
#> 2 Group B <tibble [25 x 4]> <dbl [6]> <tibble [25 x 5]>
#> 3 Group C <tibble [25 x 4]> <dbl [6]> <tibble [25 x 5]>
We can also define a couple of helper functions to make this more readable:
# Replace elements in x given by f(x) with val
replace_if <- function(x, f, val, ...) {
  replace(x, f(x, ...), val)
}

# TRUE for elements whose value appears less than k times in x
appears_less_than <- function(x, k) {
  table(x)[as.character(x)] < k
}
Combining these two functions gets what we are after:
replace_if(c(1, 1, 2, 3), appears_less_than, k = 2, 0)
#> [1] 1 1 0 0
Now all that remains is to put the pieces together:
res <- ex %>%
  mutate(assignment = map(assignment, modify, replace_if,
                          appears_less_than, k = 3, 0))
As #thothal mentioned, there aren't any values in your data that occur more than 4 times, but with k = 3 we can have a look at the result (to illustrate, just the 3rd dataframe in assignment):
res %>% pluck("assignment", 3)
#> # A tibble: 25 x 5
#> `0%` `0.01%` `0.02%` `0.03%` `0.04%`
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0 0 0 0 0
#> 2 0 0 0 0 0
#> 3 0 0 0 3 3
#> 4 0 0 0 0 0
#> 5 0 0 5 0 5
#> 6 0 0 0 0 5
#> 7 0 0 0 0 0
#> 8 0 0 0 3 3
#> 9 0 0 0 0 0
#> 10 0 0 5 0 0
#> # ... with 15 more rows
Finally, we could also use a scoped mutate_at() to further reduce some of
the excess syntax:
ex %>%
  mutate_at(vars(assignment), map, modify,
            replace_if, appears_less_than, k = 3, 0)
Created on 2018-08-08 by the reprex package (v0.2.0.9000).
This should do the trick:
library(tidyverse)

ex %>%
  mutate(
    assignment = map(assignment,
                     ~ rowid_to_column(.x, "id") %>%
                       gather(key, value, -id) %>%
                       group_by(key) %>%
                       add_count(value) %>%
                       mutate(value = ifelse(n < 5, 0, value)) %>%
                       select(-n) %>%
                       spread(key, value) %>%
                       select(-id)
    )
  )
Note in your example there is no single value appearing more than 4 times.
Explanation:
- You map over all assignment data.frames.
- For each data.frame you first add an id column (needed for gather/spread).
- Then you gather all columns but id into a key (former column names) / value (the values) pair.
- For each group of former columns (now in key) you add a counter n of the values in value.
- Then you replace values which appear less than 5 times with 0.
- You remove n (the counter).
- You spread the data back into the original format.
- You remove the id column.
