Related
I have received a data frame for analysis; each observation is a row, with 120 variables. Unfortunately, I have not received an observation ID variable that uniquely identifies each observation.
I was thinking maybe I could concatenate all columns to a string and hash this string to obtain a unique ID.
How can I do this without specifying all the variables, as I would have to with paste()? Or is there another solution?
The data can contain NA values.
Here is the sample dataset:
structure(list(Class = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), levels = c("1st", "2nd",
"3rd", "Crew"), class = "factor"), Sex = structure(c(1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), levels = c("Male",
"Female"), class = "factor"), Age = structure(c(1L, NA, 1L, NA,
1L, NA, 1L, 1L, 2L, 2L, NA, 2L, 2L, 2L, 2L, NA, 1L, 1L, 1L, NA,
NA, 1L, 1L, 1L, NA, 2L, 2L, 2L, 2L, 2L, 2L, NA), levels = c("Child",
"Adult"), class = "factor"), Survived = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), levels = c("No",
"Yes"), class = "factor"), Freq = c(0, 0, 35, 0, 0, 0, 17, 0,
118, 154, 387, 670, 4, 13, 89, 3, 5, 11, 13, 0, 1, 13, 14, 0,
57, 14, 75, 192, 140, 80, 76, 20)), row.names = c(NA, -32L), class = "data.frame")
Maybe you want to use the unique_identifier function from the udpipe package which does:
Create a unique identifier for each combination of fields in a data
frame. This unique identifier is unique for each combination of the
elements of the fields. The generated identifier is like a primary key
or a secondary key on a table. This is just a small wrapper around
frank
Here is a reproducible example:
df <- structure(list(Class = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), levels = c("1st", "2nd",
"3rd", "Crew"), class = "factor"), Sex = structure(c(1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), levels = c("Male",
"Female"), class = "factor"), Age = structure(c(1L, NA, 1L, NA,
1L, NA, 1L, 1L, 2L, 2L, NA, 2L, 2L, 2L, 2L, NA, 1L, 1L, 1L, NA,
NA, 1L, 1L, 1L, NA, 2L, 2L, 2L, 2L, 2L, 2L, NA), levels = c("Child",
"Adult"), class = "factor"), Survived = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), levels = c("No",
"Yes"), class = "factor"), Freq = c(0, 0, 35, 0, 0, 0, 17, 0,
118, 154, 387, 670, 4, 13, 89, 3, 5, 11, 13, 0, 1, 13, 14, 0,
57, 14, 75, 192, 140, 80, 76, 20)), row.names = c(NA, -32L), class = "data.frame")
library(udpipe)
#> Warning: package 'udpipe' was built under R version 4.1.2
df$ID <- unique_identifier(df, fields = colnames(df))
df
#> Class Sex Age Survived Freq ID
#> 1 1st Male Child No 0 1
#> 2 2nd Male <NA> No 0 12
#> 3 3rd Male Child No 35 17
#> 4 Crew Male <NA> No 0 27
#> 5 1st Female Child No 0 5
#> 6 2nd Female <NA> No 0 16
#> 7 3rd Female Child No 17 21
#> 8 Crew Female Child No 0 29
#> 9 1st Male Adult No 118 3
#> 10 2nd Male Adult No 154 10
#> 11 3rd Male <NA> No 387 20
#> 12 Crew Male Adult No 670 25
#> 13 1st Female Adult No 4 6
#> 14 2nd Female Adult No 13 14
#> 15 3rd Female Adult No 89 23
#> 16 Crew Female <NA> No 3 31
#> 17 1st Male Child Yes 5 2
#> 18 2nd Male Child Yes 11 9
#> 19 3rd Male Child Yes 13 18
#> 20 Crew Male <NA> Yes 0 28
#> 21 1st Female <NA> Yes 1 8
#> 22 2nd Female Child Yes 13 13
#> 23 3rd Female Child Yes 14 22
#> 24 Crew Female Child Yes 0 30
#> 25 1st Male <NA> Yes 57 4
#> 26 2nd Male Adult Yes 14 11
#> 27 3rd Male Adult Yes 75 19
#> 28 Crew Male Adult Yes 192 26
#> 29 1st Female Adult Yes 140 7
#> 30 2nd Female Adult Yes 80 15
#> 31 3rd Female Adult Yes 76 24
#> 32 Crew Female <NA> Yes 20 32
Created on 2022-07-24 by the reprex package (v2.0.1)
Another option is to use unclass on factors (i.e., after pasting all columns together using Reduce), which will convert the factors to their numbers.
df$ID <- c(unclass(as.factor(Reduce(paste, df))))
Output
Class Sex Age Survived Freq ID
1 1st Male Child No 0 6
2 2nd Male <NA> No 0 16
3 3rd Male Child No 35 22
4 Crew Male <NA> No 0 31
5 1st Female Child No 0 3
6 2nd Female <NA> No 0 12
7 3rd Female Child No 17 19
8 Crew Female Child No 0 25
9 1st Male Adult No 118 5
10 2nd Male Adult No 154 13
11 3rd Male <NA> No 387 24
12 Crew Male Adult No 670 29
13 1st Female Adult No 4 1
14 2nd Female Adult No 13 9
15 3rd Female Adult No 89 17
16 Crew Female <NA> No 3 27
17 1st Male Child Yes 5 7
18 2nd Male Child Yes 11 15
19 3rd Male Child Yes 13 23
20 Crew Male <NA> Yes 0 32
21 1st Female <NA> Yes 1 4
22 2nd Female Child Yes 13 11
23 3rd Female Child Yes 14 20
24 Crew Female Child Yes 0 26
25 1st Male <NA> Yes 57 8
26 2nd Male Adult Yes 14 14
27 3rd Male Adult Yes 75 21
28 Crew Male Adult Yes 192 30
29 1st Female Adult Yes 140 2
30 2nd Female Adult Yes 80 10
31 3rd Female Adult Yes 76 18
32 Crew Female <NA> Yes 20 28
I have two main issues I could use some help getting resolved.
1.) There are odd lines at the base of my columns which I am not sure how to get rid of.
2.) I am running into overlap with the columns when I graph. (I think this has something to do with the position_dodge(width= XXX), but I am not totally sure.)
Attached an image of an example plot, mainly because I am not sure how to describe what is happening at the base of the plot.
The following code is being used.
where_2 <- where %>%
group_by_("gender", "radio") %>%
summarise(count = n()) %>%
mutate(perc = (perc = (count / sum(count) * 100)))
gg <- ggplot(where_2, aes_string(x = names(where_2[1]), y = where_2$perc, fill = "radio"))
gg <- gg + geom_bar(aes(y = (..count..) / sum(..count..)))
gg <-gg + geom_bar(position = position_dodge(.5),stat = "identity", width = .75)
#gg <- gg + scale_y_continuous(labels = scales::percent)
gg <- gg + xlab(paste0(lab5[2, title]))
gg <- gg + scale_fill_discrete(labels = c("Yes", "No"))
print(gg)
I have been running into a wall for the past 4 days with this question; any help would be appreciated.
place gender Radio
1 Male No
1 Female Yes
1 Male No
1 Female Yes
1 Male Yes
1 Male Yes
1 Female Yes
1 Female Yes
1 Male Yes
1 Female No
1 Male Yes
1 Male Yes
1 Male No
1 Female No
1 Female Yes
1 Female Yes
1 Female No
1 Male Yes
1 Female No
1 Female Yes
1 Female No
1 Female Yes
1 Male No
1 Male No
1 Female No
1 Male No
1 Female No
1 Female No
1 Female No
1 Male Yes
1 Female No
1 Female No
1 Female Yes
1 Male No
1 Male Yes
1 Female No
2 Male Yes
2 Male Yes
2 Female No
2 Female No
2 Male Yes
2 Female No
2 Male No
2 Male Yes
2 Female No
2 Female No
2 Female No
2 Male No
2 Female No
2 Male No
2 Female Yes
2 Female Yes
2 Male Yes
2 Male No
2 Male Yes
3 Female No
3 Male Yes
3 Female No
3 Male No
3 Male Yes
3 Female No
3 Female Yes
3 Male No
3 Male Yes
3 Female Yes
3 Male No
3 Female No
3 Female Yes
3 Female No
3 Female Yes
3 Female No
3 Male Yes
3 Female No
3 Female No
4 Male Yes
4 Female No
4 Female Yes
4 Female Yes
4 Male Yes
4 Female No
4 Female No
4 Male No
4 Female No
4 Female No
4 Female No
4 Male Yes
4 Male Yes
4 Female Yes
4 Female No
4 Male Yes
4 Male Yes
4 Male Yes
4 Female No
4 Female No
4 Female No
Try this:
gg <- ggplot(where2,
aes(x = gender, y = perc, fill = Radio)) +
geom_col(position = "dodge", width = .75)
print(gg)
Explanation below:
You are right that the "feet" are indeed caused by geom_bar(aes(y = (..count..) / sum(..count..))). I'm not sure why you included it in the first place, but here's why it created the "feet":
Good chart
p <- ggplot(where2, aes(x = gender, y = perc, fill = Radio))
p + geom_col(position = position_dodge(0.5), width = 0.75)
Above is the chart you want to get (I assume). geom_col() is equivalent to geom_bar(stat = "identity") with less typing, so I used that instead.
Usually people set the same value in position_dodge() and width =, which would avoid the overlapped look. I've retained it for now to contrast with the "feet" below.
Notice also the values on the y-axis. They range from 0 to 60+.
Bad chart
p + geom_bar(aes(y = (..count..) / sum(..count..)))
Above is the chart of the "feet", now occupying the entire plot's height. Here, ..count.. returns the number of rows for each combination of gender & Radio, while sum(..count..) returns the total number of rows in the data frame. The data frame, where2, has 4 rows, one for each combination, so the y value associated with each bar is 0.25, and the stacked height of each gender's two bars is 0.5.
I consider this the bad chart, because the visualisation is useless. When you have already counted the number of rows in your dataset yourself (going from where to where2), it's not necessary for ggplot to do it again.
Good chart + bad chart = weird chart
p +
geom_col(position = position_dodge(0.5), width = 0.75) +
geom_bar(aes(y = (..count..) / sum(..count..)))
Above is the combined chart with both layers. Now the bad chart's bars are squeezed all the way to the bottom, since their combined height is only 0.5, while the good chart's bars stretch all the way to 60+.
data used:
> dput(where)
structure(list(place = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L), gender = structure(c(2L, 1L, 2L, 1L,
2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L,
2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L,
2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L), .Label = c("Female",
"Male"), class = "factor"), Radio = structure(c(1L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L,
2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor")), .Names = c("place", "gender", "Radio"
), class = "data.frame", row.names = c(NA, -95L))
# Count respondents per gender/Radio combination, then express each count as a
# percentage of its gender's total. summarise() drops the last grouping level
# (Radio), so sum(count) in mutate() is taken within each gender.
where2 <- where %>%
  group_by(gender, Radio) %>%
  summarise(count = n()) %>%
  mutate(perc = count / sum(count) * 100)
> where2
# A tibble: 4 x 4
# Groups: gender [2]
gender Radio count perc
<fctr> <fctr> <int> <dbl>
1 Female No 37 67.3
2 Female Yes 18 32.7
3 Male No 15 37.5
4 Male Yes 25 62.5
I have asked this question earlier and received a reply which was not in accordance with my wish. At the time I used stata to do the job. However as I routinely work with such data, I wish to use R to create what I wanted. I have a data set of daily hospital admission by age, sex and diagnoses. I wish to aggregate and reshape the data from long to wide. How could I achieve this objective? Sample data and required output are shown below. The column headers designate prefix of sex, age and diagnoses.
Thanks
Sample data
structure(list(diag = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("card", "cere"), class = "factor"), sex = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("Female", "Male"), class = "factor"),
age = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("35-64",
"65-74"), class = "factor"), admissions = c(1L, 1L, 0L, 0L,
6L, 6L, 6L, 1L, 4L, 0L, 0L, 0L, 4L, 6L, 5L, 2L, 2L, 4L, 1L,
0L, 6L, 5L, 6L, 4L), bdate = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L), .Label = c("1987-01-01", "1987-01-02",
"1987-01-03"), class = "factor")), .Names = c("diag", "sex",
"age", "admissions", "bdate"), row.names = c(NA, -24L), class = "data.frame")
Required output
structure(list(date = structure(1:3, .Label = c("01jan1987",
"02jan1987", "03jan1987"), class = "factor"), f3564card = c(1L,
4L, 2L), f6574card = c(1L, 0L, 4L), m3564card = c(0L, 0L, 1L),
m6574card = c(0L, 0L, 0L), f3564cere = c(6L, 4L, 6L), f6574cere = c(6L,
6L, 5L), m3564cere = c(6L, 5L, 6L), m6574cere = c(1L, 2L,
4L)), .Names = c("date", "f3564card", "f6574card", "m3564card",
"m6574card", "f3564cere", "f6574cere", "m3564cere", "m6574cere"
), class = "data.frame", row.names = c(NA, -3L))
Your data are already in a long format that can be used easily by "reshape2", like this:
# dcast() and its value.var argument come from reshape2, not the older reshape
# package (which only offers cast()).
library(reshape2)
# One row per bdate and one column per sex/age/diag combination, filled with
# the corresponding admissions value.
dcast(df, bdate ~ sex + age + diag, value.var = "admissions")
# bdate Female_35-64_card Female_35-64_cere Female_65-74_card Female_65-74_cere
# 1 1987-01-01 1 6 1 6
# 2 1987-01-02 4 4 0 6
# 3 1987-01-03 2 6 4 5
# Male_35-64_card Male_35-64_cere Male_65-74_card Male_65-74_cere
# 1 0 6 0 1
# 2 0 5 0 2
# 3 1 6 0 4
I don't see any aggregation in your sample output, but if aggregation is required, you can achieve this with the fun.aggregate argument of dcast.
# Read the comma-separated sample file, keeping text columns as character.
# Argument names are spelled out in full (header, not head) so the call does
# not rely on partial matching, and TRUE/FALSE replace the reassignable T/F.
df <- read.table("D:/Programacao/R/Stackoverflow/Nova pasta/sample.csv",
                 header = TRUE, dec = ".", sep = ",",
                 stringsAsFactors = FALSE)
head(df)
date sex cvd ACS age
1 01 Jul 91 female 0 0 35-64
2 01 Jul 91 male 0 0 35-64
3 01 Jul 91 female 0 0 35-64
4 01 Jul 91 male 1 1 35-64
5 01 Jul 91 female 0 0 65-74
6 02 Jul 91 male 0 0 65-74
Considering that cvd and ACS are not mutually exclusive to males and females respectively,
library(dplyr)
# Total cvd and ACS for every date/sex/age combination.
# %>% is used for chaining; the older %.% operator is no longer provided by
# dplyr.
df %>%
  group_by(date, sex, age) %>%
  summarise(vcvd = sum(cvd),
            vacs = sum(ACS))
Source: local data frame [111 x 5]
Groups: date, sex
date sex age vcvd vacs
1 01 Jul 91 female 35-64 0 0
2 01 Jul 91 female 65-74 0 0
3 01 Jul 91 male 35-64 1 1
4 02 Aug 91 female 35-64 0 0
5 02 Jul 91 female 65-74 1 0
6 02 Jul 91 male 65-74 0 0
7 03 Aug 91 female 65-74 0 0
8 03 Jul 91 female 35-64 0 0
9 04 Jul 91 male 35-64 1 0
10 04 Jul 91 male 65-74 0 0
.. ... ... ... ... ...
I have gotten frustrated trying to solve this seemingly simple problem. I have a dataset (df) like this:
structure(list(Year = c(2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L), Unknown = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), Temp = c(21L, 21L, 21L, 23L, 23L, 21L, 21L, 22L, 21L, 23L,
23L, 22L, 21L, 21L, 22L, 22L, 21L, 21L, 23L, 23L), Obs = structure(c(1L,
1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L), .Label = c("mdk", "sde"), class = "factor"), State = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "ma", class = "factor"), Zone = c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), Segment = c(8L, 7L, 4L, 17L, 18L, 7L, 2L, 12L, 1L, 17L,
18L, 12L, 9L, 7L, 13L, 11L, 8L, 9L, 17L, 18L), Subseg = c(1L,
3L, 3L, 2L, 2L, 2L, 4L, 0L, 10L, 4L, 2L, 0L, 1L, 1L, 3L, 1L,
2L, 2L, 1L, 1L), Wdir = structure(c(2L, 2L, 1L, 3L, 3L, 2L, 2L,
1L, 2L, 3L, 3L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L), .Label = c("na",
"ne", "nw"), class = "factor"), Wvel = structure(c(1L, 1L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
2L), .Label = c("5", "na"), class = "factor"), Clouds = structure(c(1L,
1L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 3L, 1L,
1L, 3L, 3L), .Label = c("1", "4", "na"), class = "factor"), Temp.1 = structure(c(1L,
1L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 3L, 1L,
1L, 3L, 3L), .Label = c("20", "25", "na"), class = "factor"),
Species = structure(c(7L, 21L, 1L, 21L, 16L, 4L, 16L, 6L,
1L, 17L, 5L, 7L, 5L, 1L, 1L, 6L, 7L, 7L, 24L, 5L), .Label = c("ABDU",
"ABDU", "ABDU", "ABDU", "ABDU", "CAGO", "CAGO", "CAGO", "CAGO",
"CAGO", "GOLD", "GOLD", "GOLD", "GOLD", "GOLD", "MERG", "MERG",
"MERG", "MERG", "MERG", "SCOT", "SCOT", "SCOT", "SCOT",
"SCOT", "SCOT", "SCOT"), class = "factor"), Count = c(5L,
1L, 150L, 3L, 20L, 8L, 5L, 10L, 5L, 1L, 20L, 10L, 2L, 2L,
80L, 40L, 1L, 1000L, 2L, 20L)), .Names = c("Year", "Unknown",
"Temp", "Obs", "State", "Zone", "Segment", "Subseg", "Wdir",
"Wvel", "Clouds", "Temp.1", "Species", "Count"), row.names = c(666L,
614L, 2060L, 1738L, 1459L, 536L, 197L, 2467L, 98L, 1794L, 1449L,
2464L, 696L, 483L, 2644L, 2350L, 686L, 844L, 2989L, 2934L), class = "data.frame")
With a header that looks like this:
Year Unknown Temp Obs State Zone Segment Subseg Wdir Wvel
666 2015 1 21 mdk ma 2 8 1 ne 5
614 2015 1 21 mdk ma 2 7 3 ne 5
2060 2015 1 21 sde ma 2 4 3 na na
1738 2015 1 23 mdk ma 2 17 2 nw 5
1459 2015 1 23 mdk ma 2 18 2 nw 5
536 2015 1 21 mdk ma 2 7 2 ne 5
Clouds Temp.1 Species Count
666 1 20 CAGO 5
614 1 20 SCOT 1
2060 na na ABDU 150
1738 1 20 SCOT 3
1459 1 20 MERG 20
536 1 20 ABDU 8
Among other things within dplyr, I want to get a sum of each species as a new column, when I am grouping by segment. This is the final code I have tried with many variations.
df_group = df %>%
group_by(Segment) %>%
summarise(temp = round(mean(Temp)),
WDir = round(mean(Wdir)),
ABDU = sum(which(Species=="ABDU"),Count),
CAGO = sum(which(Species=="CAGO"),Count),
GOLD = sum(which(Species=="GOLD"),Count),
MERG = sum(which(Species=="MERG"),Count),
SCOT = sum(which(Species=="SCOT"),Count))
And this is what I get (to show correct format):
Segment temp WDir ABDU CAGO GOLD MERG SCOT
1 1 21 2 6 5 5 5 5
2 2 21 2 5 5 5 6 5
3 4 21 1 151 150 150 150 150
4 7 21 2 16 11 11 11 12
5 8 21 2 6 9 6 6 6
6 9 21 2 1003 1004 1002 1002 1002
The format and general idea are what I want, but the numbers are not adding up the way I want them to. I'm sure it is simple but need some help! Thanks.
The problem is that which returns a vector of the positions, but you're not using those to subset. So the sum you are getting is of the positions which are true in addition to the count variable. e.g.
# Toy data: three labels and the count that belongs to each of them.
x <- c("a", "b", "b")
count <- c(10, 11, 12)
# which() yields the matching positions (2 and 3); sum() then adds those
# positions to every element of count instead of subsetting it.
sum(which(x == "b"), count)
# 38 because it is 2 + 3 + 10 + 11 + 12
I believe what you want is (or at least one way of writing it):
sum(ifelse(x == "b", count, 0))
# 23 because it is equal to 0 + 11 + 12
Translating into dplyr syntax, your example could look like this:
# Per-Segment summary: rounded mean temperature and wind direction, plus one
# total-count column for each species.
df_group = df %>%
group_by(Segment) %>%
summarise(temp = round(mean(Temp)),
# NOTE(review): in the dput() sample Wdir is a factor ("na", "ne", "nw"), for
# which mean() returns NA with a warning -- confirm it is numeric in the real data.
WDir = round(mean(Wdir)),
# ifelse() keeps Count where the species matches and substitutes 0 elsewhere,
# so each sum() only adds up the rows belonging to that one species.
ABDU = sum(ifelse(Species=="ABDU", Count, 0L)),
CAGO = sum(ifelse(Species=="CAGO", Count, 0L)),
GOLD = sum(ifelse(Species=="GOLD", Count, 0L)),
MERG = sum(ifelse(Species=="MERG", Count, 0L)),
SCOT = sum(ifelse(Species=="SCOT", Count, 0L)))
Another approach, in case you don't want to type out the sum for all your species:
library(reshape2)
library(dplyr)
# The dput() data carries duplicated factor levels for Species, so the factor
# is rebuilt from its character labels before reshaping.
df$Species <- as.factor(as.character(df$Species))
# Wide table: one row per Segment, one column per species, each cell holding
# the summed Count for that pair.
species.counts <- df %>%
  select(Segment, Species, Count) %>%
  dcast(formula = Segment ~ Species, value.var = "Count", fun.aggregate = sum)
> head(species.counts)
Segment ABDU CAGO MERG SCOT
1 1 5 0 0 0
2 2 0 0 5 0
3 4 150 0 0 0
4 7 10 0 0 1
5 8 0 6 0 0
6 9 2 1000 0 0
# Rounded mean temperature per Segment, joined to the per-species totals.
# The key is named explicitly so the join does not depend on whichever column
# names the two tables happen to share.
df %>%
  group_by(Segment) %>%
  summarise(temp = round(mean(Temp))) %>%
  left_join(species.counts, by = "Segment")
Source: local data frame [11 x 6]
Segment temp ABDU CAGO MERG SCOT
1 1 21 5 0 0 0
2 2 21 0 0 5 0
3 4 21 150 0 0 0
4 7 21 10 0 0 1
5 8 21 0 6 0 0
6 9 21 2 1000 0 0
I also couldn't do the wind direction average, because your dput data only has that as a factor with the directions, not like the head() you showed, but the technique generalizes.
I have two data frames
df1 <- structure(list(g1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"), g2 = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("a", "b", "c"), class = "factor"), val1 = 1:20, val2 = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 4L, 1L, 2L, 3L)), .Names = c("g1", "g2", "val1", "val2"), row.names = c(NA, -20L), class = "data.frame")
df2 <- structure(list(g1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"), g2 = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L), .Label = c("a", "b", "c"), class = "factor"), val3 = c(5L, 6L, 7L, 3L, 4L, 5L, 2L, 3L, 4L, 8L, 9L, 10L, 4L, 5L, 6L, 5L, 6L)), .Names = c("g1", "g2", "val3"), row.names = c(NA, -17L), class = "data.frame")
> df1
g1 g2 val1 val2
1 A a 1 1
2 A a 2 2
3 A a 3 3
4 A a 4 4
5 A b 5 1
6 A b 6 2
7 A b 7 3
8 A c 8 1
9 A c 9 2
10 A c 10 3
11 B a 11 1
12 B a 12 2
13 B a 13 3
14 B b 14 1
15 B b 15 2
16 B b 16 3
17 B b 17 4
18 B c 18 1
19 B c 19 2
20 B c 20 3
> df2
g1 g2 val3
1 A a 5
2 A a 6
3 A a 7
4 A b 3
5 A b 4
6 A b 5
7 A c 2
8 A c 3
9 B c 4
10 B a 8
11 B a 9
12 B a 10
13 B b 4
14 B b 5
15 B b 6
16 B c 5
17 B c 6
My aim is to rescale df1$val2 to take values between the min and max values of df2$val3 within the respective groups.
I tried this:
library(dplyr)
df1 <- df1 %.% group_by(g1, g2) %.% mutate(rescaled=(max(df2$val3)-min(df2$val3))*(val2-min(val2))/(max(val2)-min(val2))+min(df2$val3))
But the output is different from what I expect. The problem is that I can neither cbind nor merge the two data frames due to their different lengths. Any hints?
Does this work?
library(plyr)
df3 <- ddply(df2, .(g1, g2), summarize, max.val=max(val3), min.val=min(val3))
merged.df <- merge(df1, df3, by=c("g1", "g2"), all.x=TRUE)
## Now rescale merged.df$val2 as desired