I have a dataframe that looks like the following (dput at the end):
region type age_group year value
AO1 p 0 1990 12
AO1 p 5 1990 10
AO1 p 10 1990 8
AO1 p 15 1990 14
AO1 p 20 1990 19
...
AO1 p 80 1990 12
AO1 p 1 1990 0.54
AO1 p 2 1990 0.46
AO1 p 3 1990 1
where the last three lines express the proportion of males (1), females (2) and the total (3).
What I would like to do is to produce two more variables value.m and value.f by multiplying value by the correct percentage
In this case, value.m would use 0.54 and value.f 0.46 for year 1990 in region AO1
dt$value.m <- dt %>%
group_by(region, type, age_num, year) %>%
mutate(value.m=value*???)
Any ideas?
dt <- structure(list(region = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 4L, 4L, 4L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 4L, 4L, 4L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 4L, 4L,
4L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 4L, 4L, 4L, 2L, 2L, 2L), .Label =
c("AO1", "AO11", "AO22", "AO3"), class = "factor"), age = structure(c(1L,
10L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 1L, 10L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L,
14L, 15L, 16L, 17L, 1L, 10L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
11L, 12L, 13L, 14L, 15L, 16L, 17L, 1L, 10L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 19L, 18L, 20L,
19L, 18L, 20L, 19L, 18L, 20L, 19L, 18L, 20L, 21L, 30L, 22L, 23L,
24L, 25L, 26L, 27L, 28L, 29L, 31L, 32L, 33L, 34L, 35L, 36L, 37L,
21L, 30L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 31L, 32L, 33L,
34L, 35L, 36L, 37L, 21L, 30L, 22L, 23L, 24L, 25L, 26L, 27L, 28L,
29L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 21L, 30L, 22L, 23L, 24L,
25L, 26L, 27L, 28L, 29L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 39L,
38L, 40L, 39L, 38L, 40L, 39L, 38L, 40L, 39L, 38L, 40L, 1L, 10L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 1L, 10L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L,
14L, 15L, 16L, 17L, 1L, 10L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
11L, 12L, 13L, 14L, 15L, 16L, 17L, 1L, 10L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 19L, 18L, 20L,
19L, 18L, 20L, 19L, 18L, 20L, 19L, 18L, 20L, 21L, 30L, 22L, 23L,
24L, 25L, 26L, 27L, 28L, 29L, 31L, 32L, 33L, 34L, 35L, 36L, 37L,
21L, 30L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 31L, 32L, 33L,
34L, 35L, 36L, 37L, 21L, 30L, 22L, 23L, 24L, 25L, 26L, 27L, 28L,
29L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 21L, 30L, 22L, 23L, 24L,
25L, 26L, 27L, 28L, 29L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 39L,
38L, 40L, 39L, 38L, 40L, 39L, 38L, 40L, 39L, 38L, 40L), .Label = c("c_0_4",
"c_10_14", "c_15_19", "c_20_24", "c_25_29", "c_30_34", "c_35_39",
"c_40_44", "c_45_49", "c_5_9", "c_50_54", "c_55_59", "c_60_64",
"c_65_69", "c_70_74", "c_75_79", "c_80+", "c_f", "c_m", "c_total_sex",
"p_0_4", "p_10_14", "p_15_19", "p_20_24", "p_25_29", "p_30_34",
"p_35_39", "p_40_44", "p_45_49", "p_5_9", "p_50_54", "p_55_59",
"p_60_64", "p_65_69", "p_70_74", "p_75_79", "p_80+", "p_f", "p_m",
"p_total_sex"), class = "factor"), age_num = c(0L, 5L, 10L, 15L,
20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L,
0L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L,
65L, 70L, 75L, 80L, 0L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L,
45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 0L, 5L, 10L, 15L, 20L,
25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 0L, 5L, 10L, 15L,
20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L,
0L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L,
65L, 70L, 75L, 80L, 0L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L,
45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 0L, 5L, 10L, 15L, 20L,
25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 0L, 5L, 10L, 15L,
20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L,
0L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L,
65L, 70L, 75L, 80L, 0L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L,
45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 0L, 5L, 10L, 15L, 20L,
25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 0L, 5L, 10L, 15L,
20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L,
0L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L,
65L, 70L, 75L, 80L, 0L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L,
45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 0L, 5L, 10L, 15L, 20L,
25L, 30L, 35L, 40L, 45L, 50L, 55L, 60L, 65L, 70L, 75L, 80L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), year = c(2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L), value
= c(79.6, 55.1, 44.6, 44.3,
26.8, 9.5, 7.2, 6.5, 5.6, 2.4, 0.6, 5.2, 7.6, 10.4, 12, 13.5,
13.5, 42.4, 23.1, 14.7, 12.5, 3.9, 1.4, 2.4, 5, 4.2, 7, 7.6,
10.2, 9.5, 11.1, 12.1, 13.8, 14.1, 30.5, 18.1, 14.6, 7.6, 1.4,
3.3, 4.1, 6.9, 8, 9.9, 9.8, 13.5, 13.1, 14.1, 14.2, 14.6, 14.6,
60.1, 52.1, 52.5, 64.1, 45.5, 26.9, 10.6, 7.7, 8.7, 0.4, 0.5,
4.1, 8.8, 9.9, 12.4, 13.3, 14, 216.8, 227.6, 459.7, 115.8, 112.3,
243.5, 85, 87.9, 188.2, 241.6, 253.9, 510.8, 0.2, 0.15, 0.13,
0.13, 0.09, 0.053, 0.05, 0.05, 0.04, 0.03, 0.03, 0.024, 0, 0.01,
0.016, 0, 0, 0.22, 0.15, 0.12, 0.11, 0.07, 0.05, 0.05, 0.04,
0.04, 0.03, 0.03, 0.02, 0.02, 0.02, 0.01, 0.01, 0, 0.2, 0.19,
0.15, 0.11, 0.07, 0.06, 0.06, 0.04, 0.04, 0.03, 0.03, 0.01, 0.01,
0.01, 0.01, 0, 0, 0.14, 0.13, 0.13, 0.15, 0.12, 0.08, 0.05, 0.04,
0.05, 0.03, 0.03, 0.02, 0.01, 0.01, 0.01, 0, 0, 0.49, 0.51, 1,
0.51, 0.49, 1, 0.49, 0.51, 1, 0.49, 0.51, 1, 241.9, 175.54, 146.5,
138.46, 108.14, 73.94, 66.58, 64.78, 58.9, 43.86, 49.1, 36.5,
33.38, 25.54, 21.66, 18.42, 18.58, 243.74, 163.86, 130.22, 121.42,
96.1, 80.3, 63.9, 55.02, 49.02, 41.78, 51.74, 35.22, 32.66, 25.78,
23.06, 18.66, 18.14, 152.5, 109.9, 93.34, 82.62, 61.7, 56.06,
44.38, 38.26, 33.02, 29.58, 30.86, 21.86, 21.18, 17.62, 17.86,
15.86, 15.58, 196.82, 175.74, 180.46, 182.3, 153.22, 118.18,
81.34, 70.46, 65.82, 47.7, 54.66, 38.54, 29.42, 25.58, 20.38,
18.18, 17.18, 547.58, 566.78, 1100.38, 519.1, 522.78, 1028.06,
310.54, 322.26, 618.82, 619.62, 647.02, 1252.66, 0.206, 0.15,
0.126, 0.122, 0.088, 0.052, 0.05, 0.05, 0.04, 0.03, 0.032, 0.02,
0.02, 0.01, 0.01, 0, 0.002, 0.222, 0.15, 0.118, 0.108, 0.074,
0.054, 0.05, 0.04, 0.038, 0.028, 0.032, 0.02, 0.02, 0.018, 0.01,
0.008, 0, 0.23, 0.158, 0.142, 0.11, 0.074, 0.064, 0.056, 0.04,
0.038, 0.028, 0.03, 0.012, 0.01, 0.01, 0.01, 0, 0, 0.144, 0.132,
0.134, 0.14, 0.118, 0.082, 0.054, 0.042, 0.046, 0.028, 0.032,
0.02, 0.01, 0.01, 0.008, 0, 0, 0.49, 0.51, 1, 0.57, 0.43, 1,
0.4, 0.6, 1, 0.3, 0.7, 1)), .Names = c("region", "age", "age_num",
"year", "value"), class = "data.frame", row.names = c(NA, -320L))
Step 1: merge year and region in one variable (I work on dt, that you've dput-ed)
# Build a combined "region year" key so that per-region, per-year rows can be matched later.
new.dt <- mutate(dt, regyear = paste(region, year))
Step 2: create data.frame with your p_m's and regyear only:
# Lookup table of male shares: one row per regyear key, with the share stored
# as `pm` so it does not clash with the `value` column of new.dt.
p.m.s <- select(filter(new.dt, age == "p_m"), regyear, pm = value)
Step 3: the same with p_f's:
# Lookup table of female shares, keyed by regyear, with the share stored as `pf`.
p.f.s <- new.dt %>%
  filter(age == "p_f") %>%
  select(regyear, pf = value)
Step 4: get what you need :)
# Attach the male and female shares to every row through the regyear key,
# compute the sex-specific values, then drop the helper columns.
# The join key is stated explicitly so the join cannot silently pick up any
# other column the tables happen to share.
new.dt %>%
  left_join(p.m.s, by = "regyear") %>%  # add p_m's
  left_join(p.f.s, by = "regyear") %>%  # add p_f's
  mutate(value.m = value * pm, value.f = value * pf) %>%
  select(-c(regyear, pm, pf))           # clean up
Hope this helped!
Hi, in the data you gave, the variable type is called age, so be careful about this. With your data you can accomplish that by doing this:
# Look up the male / female shares (the rows whose age is "p_m" / "p_f") and
# attach them to every row of dt, then compute the sex-specific values.
# The shares differ between years, so year has to be part of the join key:
# joining on region alone would pair each row with the shares of every year.
# Each line ends with the pipe, because R cannot parse a line that starts
# with %>%.
dt %>%
  left_join(
    dt %>% filter(age == "p_m") %>% select(region, year, p_m = value),
    by = c("region", "year")
  ) %>%
  left_join(
    dt %>% filter(age == "p_f") %>% select(region, year, p_f = value),
    by = c("region", "year")
  ) %>%
  mutate(value.m = value * p_m, value.f = value * p_f) %>%
  select(-c(p_m, p_f))
This code filters the p_m and p_f rows for each region and joins them with the original table.
It then uses mutate to calculate the values and drops the p_m and p_f columns.
I have a dataframe like this one:
> dput(df)
structure(list(OBBLIGATORIO = structure(c(2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("no",
"yes"), class = "factor"), COUNTRY = structure(c(16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L), .Label = c("Austria", "Belgium", "Bulgaria",
"Croatia", "Cyprus", "Czech Republic", "Denmark", "Estonia",
"Finland", "France", "Germany", "Greece", "Hungary", "Iceland",
"Ireland", "Italy", "Latvia", "Lithuania", "Luxembourg", "Malta",
"Norway", "Poland", "Portugal", "Romania", "Slovakia", "Slovenia",
"Spain", "Sweden", "United Kingdom of Great Britain and Northern Ireland"
), class = "factor"), YEAR = c(2003L, 2006L, 2007L, 2008L, 2009L,
2010L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L,
2003L, 2006L, 2007L, 2008L, 2009L, 2010L, 1995L, 1996L, 1997L,
1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2006L, 2007L, 2008L,
2009L, 2010L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L,
2002L, 2003L, 2006L, 2007L, 2008L, 2009L, 2010L, 1995L, 1996L,
1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2006L, 2007L,
2008L, 2009L, 2010L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L,
2001L, 2002L, 2003L, 2006L, 2007L, 2008L, 2009L, 2010L, 1995L,
1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2006L,
2007L, 2008L, 2009L, 2010L, 1995L, 1996L, 1997L, 1998L, 1999L,
2000L, 2001L, 2002L), AGE = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Total", class = "factor"),
`CAUSE OF DEATH` = c("Acute poliomyelitis", "Acute poliomyelitis",
"Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis",
"Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis",
"Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis",
"Acute poliomyelitis", "Acute poliomyelitis", "Acute poliomyelitis",
"Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria",
"Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria",
"Diphtheria", "Diphtheria", "Diphtheria", "Diphtheria", "Measles",
"Measles", "Measles", "Measles", "Measles", "Measles", "Measles",
"Measles", "Measles", "Measles", "Measles", "Measles", "Measles",
"Measles", "Tetanus", "Tetanus", "Tetanus", "Tetanus", "Tetanus",
"Tetanus", "Tetanus", "Tetanus", "Tetanus", "Tetanus", "Tetanus",
"Tetanus", "Tetanus", "Tetanus", "Tuberculosis", "Tuberculosis",
"Tuberculosis", "Tuberculosis", "Tuberculosis", "Tuberculosis",
"Tuberculosis", "Tuberculosis", "Tuberculosis", "Tuberculosis",
"Tuberculosis", "Tuberculosis", "Tuberculosis", "Tuberculosis",
"Viral hepatitis", "Viral hepatitis", "Viral hepatitis",
"Viral hepatitis", "Viral hepatitis", "Viral hepatitis",
"Viral hepatitis", "Viral hepatitis", "Viral hepatitis",
"Viral hepatitis", "Viral hepatitis", "Viral hepatitis",
"Viral hepatitis", "Viral hepatitis", "Whooping cough", "Whooping cough",
"Whooping cough", "Whooping cough", "Whooping cough", "Whooping cough",
"Whooping cough", "Whooping cough", "Whooping cough", "Whooping cough",
"Whooping cough", "Whooping cough", "Whooping cough", "Whooping cough"
), VALUE = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 4L, 2L, 2L, 2L, 1L, 1L, 6L, 7L, 7L, 1L, 2L,
3L, 2L, 5L, 12L, 9L, 13L, 9L, 13L, 8L, 17L, 14L, 16L, 18L,
15L, 19L, 11L, 10L, 25L, 24L, 21L, 22L, 23L, 20L, 34L, 32L,
31L, 30L, 29L, 28L, 27L, 26L, 41L, 42L, 43L, 45L, 46L, 47L,
33L, 35L, 36L, 37L, 38L, 39L, 40L, 44L, 1L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 1L, 3L, 1L, 1L, 1L, 1L), .Label = c("0", "1",
"2", "3", "6", "7", "9", "17", "18", "19", "21", "22", "27",
"28", "30", "31", "37", "41", "42", "301", "329", "333",
"344", "350", "396", "413", "415", "460", "517", "558", "597",
"609", "622", "647", "681", "1087", "1349", "1413", "1448",
"1499", "1576", "1654", "1725", "1948", "2531", "2665", "2757"
), class = "factor"), ID = 1:98), .Names = c("OBBLIGATORIO",
"COUNTRY", "YEAR", "AGE", "CAUSE OF DEATH", "VALUE", "ID"), row.names = c(NA,
-98L), class = "data.frame")
I want to obtain a chart that:
on x axis there are values from YEAR column
on y axis there are
values from VALUE column data are divided by CAUSE OF DEATH column
So something like:
I try:
# Open a new X11 graphics window for the plot.
x11()
# Attempt: map YEAR to x, VALUE to y and CAUSE OF DEATH to fill/colour, draw
# semi-transparent density curves, and set the x-axis limits to 1995-2010.
# NOTE(review): VALUE is stored as a factor in the dput above, so it is
# presumably not treated as a number here; the columns are also referenced as
# df$... inside aes() rather than by bare name.
ggplot(df, aes(x = df$`YEAR`, y = df$`VALUE`, fill = df$`CAUSE OF DEATH`, colour = df$`CAUSE OF DEATH`)) +
geom_density(alpha = 0.1) +
xlim(1995, 2010)
But the result is completely different from the one I want.
Thanks
I'm not sure what your actual question is, but one problem with your dataframe is that the VALUE column is currently defined as a factor, not as a numeric. I think that remedying this will go a long way to solving your problem. I do this post-facto below (i.e. after the dataframe is already created), but if you are getting the data into R via a read.table() or similar command, you can specify the class of your columns at data frame creation time, which is probably a better approach.
In my code below I use the dplyr package for manipulating dataframes. It's quite powerful, but for this particular example it isn't doing anything that base R couldn't do.
# library() stops with an error if a package is missing, whereas require()
# only returns FALSE and lets the script fail later.
library(ggplot2)
library(dplyr)
library(magrittr)

# df <- structure(...)  ### YOUR dput output goes here ###

# fix the problem with the `VALUE` column: it is a factor, so convert it via
# character to recover the printed numbers rather than the level codes
df %<>% mutate(VALUE = as.numeric(as.character(VALUE)))
# equivalent in base R:
# df$VALUE <- as.numeric(as.character(df$VALUE))

# make a graph (is it the one you want?)
# One line per cause of death: total VALUE per year, with points on top.
p <- df %>%
  group_by(YEAR, `CAUSE OF DEATH`) %>%
  summarize(value = sum(VALUE)) %>%
  ungroup() %>%
  ggplot(aes(x = YEAR, y = value, color = `CAUSE OF DEATH`)) +
  geom_line() +
  theme_bw() +
  geom_point()
print(p)

# save graph for uploading to SO (pass the plot explicitly instead of relying
# on the implicit "last plot")
ggsave("SO37230266.png", plot = p)
The result is this graph:
I am making a bar plot using lattice in R where I have data for 4 different years on sources of irrigation for different states. using my code, the bar plot is coming fine but I wish the bar corresponding to the year 1996 to be plotted first followed by the bar corresponding to year 2001 etc. so as to show the increasing area being irrigated by tube-wells. However, I am unable to change the ordering. Here is my data and the R code. Many thanks for your help.
# sample data
irr_atlas <- structure(list(state = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("ANDHRA PRADESH",
"KARNATAKA", "MADHYA PRADESH", "RAJASTHAN"), class = "factor"),
st_code = c(28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L,
28L, 28L, 28L, 28L, 28L, 28L, 28L, 29L, 29L, 29L, 29L, 29L,
29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 23L,
23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 23L,
23L, 23L, 23L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L), year = c(1996L, 1996L, 1996L, 1996L,
2001L, 2001L, 2001L, 2001L, 2006L, 2006L, 2006L, 2006L, 2011L,
2011L, 2011L, 2011L, 1996L, 1996L, 1996L, 1996L, 2001L, 2001L,
2001L, 2001L, 2006L, 2006L, 2006L, 2006L, 2011L, 2011L, 2011L,
2011L, 1996L, 1996L, 1996L, 1996L, 2001L, 2001L, 2001L, 2001L,
2006L, 2006L, 2006L, 2006L, 2011L, 2011L, 2011L, 2011L, 1996L,
1996L, 1996L, 1996L, 2001L, 2001L, 2001L, 2001L, 2006L, 2006L,
2006L, 2006L, 2011L, 2011L, 2011L, 2011L), irr_area = c(1.84066,
0.942819, 0.82886, 0.853502, 1.54922, 0.825659, 0.542492,
1.53412, 1.72969, 0.70271, 0.637221, 1.53894, 1.99893, 0.678425,
0.819829, 1.70708, 0.921594, 0.231669, 0.316999, 0.358529,
0.91339, 0.207157, 0.426549, 0.481061, 0.921255, 0.18192,
0.426145, 0.547193, 0.930802, 0.148065, 0.377149, 1.51843,
1.59425, 0.112145, 2.67683, 0.540054, 1.48056, 0.030502,
1.63696, 0.563948, 1.12595, 0.058667, 2.46494, 1.15004, 1.10444,
0.157069, 2.64378, 2.14177, 1.55814, 0.106623, 2.71347, 0.644683,
1.35746, 0.030586, 2.41845, 0.935234, 1.76933, 0.054374,
2.46197, 1.76918, 1.62587, 0.050299, 2.14737, 2.82708),irr_source = structure(c(1L,2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L,
1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L,
3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L,
4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L,
2L, 4L, 3L), .Label = c("Canal", "Tank", "Tube", "Well"), class = "factor")), .Names = c("state","st_code", "year", "irr_area", "irr_source"), class = "data.frame", row.names = c(NA, -64L))
Code for plot...
library(lattice)
# One panel per state x irrigation source, one bar per year within a panel.
# The grouping argument is spelled out in full as `groups`; `group` only
# works through partial argument matching.
barchart(
  ~ irr_area | factor(state) + factor(irr_source),
  groups = year,
  data = irr_atlas,
  auto.key = list(space = "right")
)
As mentioned, ordering of groups in R graphics is usually determined by the ordering of the factor variable. So, you can reorder your factors with factor and its levels argument.
library(lattice)
# One panel per state x irrigation source, one bar per year within a panel.
# The bar order follows the level order of the grouping factor, so the levels
# are set explicitly here to change the order of the years.
# `groups` is spelled out in full (no partial matching) and TRUE is used
# instead of the reassignable shorthand T.
barchart(
  ~ irr_area | factor(state) + factor(irr_source),
  groups = factor(year, levels = sort(unique(year), decreasing = TRUE)),
  data = irr_atlas,
  auto.key = list(space = "right")
)
You can switch the order back the other way by setting decreasing = FALSE instead.
I'm trying to program something quite simple (I think) in R, but I can't seem to get it right. I have a dataset of 50 countries (1 to 50) for 15 years each and about 20 variables per country. For now I am only testing one variable (OS) on my dependent variable (SMD). I would like to do this with a loop country by country so I would get the output for each country instead of the overall output.
I thought it would be wise to create a subset first (to be able to look at country 1 first, after which my loop should increase the number for country and test country 2). I believe my regression at the bottom of the page should give me the output for country 1 instead of the overall score for the entire dataset. However I keep getting these errors:
> pdata <- plm.data(newdata, index=c("Country","Date"))
series are constants and have been removed
> pooling <- plm(Y ~ X, data=pdata, model= "pooling")
series Country, xRegion are constants and have been removed
Error in model.matrix.pFormula(formula, data, rhs = 1, model = model, :
NA in the individual index variable
> summary(pooling)
Error in summary(pooling) : object 'pooling' not found
I might be looking at this all wrong, but I believe that without getting this to work, there is no point in going further with programming the loop itself. Any advice on solving my errors, or other ways of programming a loop are really appreciated.
My code:
# Remove every object from the current workspace.
rm(list = ls())
# Read a file chosen interactively; the first row holds the column names and
# "," is declared as the decimal separator.
mydata <- read.table(file = file.choose(), header = TRUE, dec = ",")
names(mydata)
# Put mydata's columns on the search path so SMD and OS resolve below.
attach(mydata)
# Y and X are built here from the full, attached mydata -- before the
# single-country subset is taken.
Y <- cbind(SMD)
X <- cbind(OS)
# Keep only the rows for country 1 and print them.
newdata <- subset(mydata, Country %in% c(1))
newdata
# Declare Country and Date as the panel indexes, then fit a pooled model.
# NOTE(review): pdata has no columns named Y or X, so the formula presumably
# picks up the full-data Y and X defined above rather than the subset --
# confirm.
pdata <- plm.data(newdata, index=c("Country","Date"))
pooling <- plm(Y ~ X, data=pdata, model= "pooling")
summary(pooling)
Edit: data sample of first 2 countries which causes same error
dput(mydata)
structure(list(Region = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("NAF", "SAME"), class = "factor"), Country = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), Date = c(1995L, 1996L, 1997L, 1998L,
1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L, 2007L,
2008L, 2009L, 2010L, 2011L, 2012L, 2013L, 2014L, 1995L, 1996L,
1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L,
2006L, 2007L, 2008L, 2009L, 2010L, 2011L, 2012L, 2013L, 2014L
), OS = structure(c(19L, 25L, 27L, 15L, 22L, 20L, 23L, 9L, 7L,
5L, 2L, 1L, 4L, 3L, 6L, 10L, 11L, 13L, 11L, 8L, 26L, 25L, 31L,
29L, 28L, 21L, 30L, 24L, 24L, 16L, 11L, 14L, 12L, 17L, 18L, 29L,
32L, 32L, 33L, 34L), .Label = c("51.5", "52.2", "55.6", "56.4",
"56.7", "57.7", "57.8", "58.3", "59", "59.2", "59.6", "59.9",
"60.2", "60.4", "61.1", "61.2", "62.2", "62.3", "62.8", "63.2",
"63.3", "63.8", "63.9", "64.2", "64.3", "64.5", "64.7", "65.3",
"65.5", "65.6", "66.4", "68", "69.6", "70.7"), class = "factor"),
SMD = structure(c(7L, 12L, 20L, 21L, 17L, 15L, 13L, 10L,
14L, 22L, 23L, 33L, 1L, 32L, 29L, 34L, 28L, 25L, NA, NA,
9L, 6L, 8L, 4L, 2L, 35L, 3L, 36L, 5L, 11L, 16L, 18L, 24L,
19L, 26L, 31L, 27L, 30L, NA, NA), .Label = c("100.3565662",
"13.44788845", "13.45858747", "13.56815534", "15.05892471",
"17.63789658", "18.04088718", "18.3101351", "19.34226196",
"21.25530884", "21.54423145", "23.75898948", "24.08770926",
"26.39817342", "29.44079001", "31.40605191", "34.46667996",
"34.52913657", "35.66070947", "36.4419931", "39.16875621",
"44.0126137", "45.72949566", "49.13062679", "54.83730247",
"56.87886311", "59.80971583", "60.5658962", "69.20148901",
"70.91362874", "72.64845214", "73.97139238", "75.20140919",
"76.18378138", "9.570435019", "9.867635305"), class = "factor")), .Names = c("Region",
"Country", "Date", "OS", "SMD"), class = "data.frame", row.names = c(NA,
-40L))
Are you sure you need to use plm?? This produces a list of summaries by country.
# convert factors to numeric
# Go through as.character() first: as.numeric() applied directly to a factor
# returns the internal level codes, not the numbers shown in the labels.
mydata$SMD <- as.numeric(as.character(mydata$SMD))
mydata$OS <- as.numeric(as.character(mydata$OS))

countries <- unique(mydata$Country)

# Using lapply(...): one regression summary per country, named by country
smry <- lapply(countries, function(cntry) {
  summary(lm(SMD ~ OS, data = mydata[mydata$Country == cntry, ]))
})
names(smry) <- countries

# Same thing, using for loop
# The result list is preallocated and filled by position, so each country gets
# its own element instead of the list being nested one level deeper per pass.
smry <- vector("list", length(countries))
names(smry) <- countries
for (i in seq_along(countries)) {
  smry[[i]] <- summary(
    lm(SMD ~ OS, data = mydata[mydata$Country == countries[i], ])
  )
}
In your dataset, SMD and OS are factors, which need to be converted to numeric first.