Related
I have a question that is somewhat similar to others that have been posted, but after looking thoroughly at several posts, I can't get the code to work. Any help would be much appreciated.
My data frame looks like this:
'data.frame': 501 obs. of 5 variables:
$ Tattoo.MUM : Factor w/ 250 levels "1004","1007",..: 76 76 76 81 81 81 85 85 85 85 ...
$ OffspringMUMs: int 4 4 4 4 4 4 11 11 11 11 ...
$ YearBIRTH.CUB: int 1988 1990 1991 1988 1991 2007 1989 1991 1992 1993 ...
$ YearBIRTH.MUM: int 1991 1991 NA NA NA NA 1987 1987 1987 1987 ...
$ OFFSpYR : int 2 1 1 1 2 1 1 4 3 3 ...
A few lines here:
structure(list(Tattoo.MUM = structure(c(6L, 6L, 6L, 6L, 7L, 7L,
7L, 8L, 9L, 11L, 11L, 11L, 11L, 5L, 1L, 4L, 2L, 3L, 3L, 10L,
10L, 10L, 10L, 10L), .Label = c("10454", "1045A", "1045X", "12392",
"1601", "22", "27", "29", "41", "424X", "60"), class = "factor"),
OffspringMUMs = c(11L, 11L, 11L, 11L, 5L, 5L, 5L, 1L, 3L,
7L, 7L, 7L, 7L, 1L, 2L, 1L, 1L, 4L, 4L, 6L, 6L, 6L, 6L, 6L
), YearBIRTH.CUB = c(1989L, 1991L, 1992L, 1993L, 1990L, 1991L,
1993L, 1989L, 1988L, 1988L, 1989L, 1991L, 1994L, 2015L, 2012L,
2015L, 2005L, 2009L, 2010L, 1996L, 1998L, 2000L, 2001L, 2006L
), YearBIRTH.MUM = c(1987L, 1987L, 1987L, 1987L, NA, NA,
NA, NA, NA, 1987L, 1987L, 1987L, 1987L, NA, NA, NA, NA, 2005L,
2005L, 1994L, 1994L, 1994L, 1994L, 1994L), OFFSpYR = c(1L,
4L, 3L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L)), .Names = c("Tattoo.MUM",
"OffspringMUMs", "YearBIRTH.CUB", "YearBIRTH.MUM", "OFFSpYR"), class = "data.frame", row.names = c(NA,
-24L))
I want to add new rows for all missing years (YearBIRTH.CUB) in Tattoo.MUM keeping the rest of the values the same and adding '0' to OFFSpYR.
Like so:
structure(list(Tattoo.MUM = structure(c(6L, 6L, 6L, 6L, 6L, 7L,
7L, 7L, 7L, 8L, 9L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 5L, 1L,
4L, 2L, 3L, 3L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L), .Label = c("10454", "1045A", "1045X", "12392", "1601",
"22", "27", "29", "41", "424X", "60"), class = "factor"), OffspringMUMs = c(11L,
11L, 11L, 11L, 11L, 5L, 5L, 5L, 5L, 1L, 3L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 1L, 2L, 1L, 1L, 4L, 4L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L), YearBIRTH.CUB = c(1989L, 1990L, 1991L, 1992L, 1993L,
1990L, 1991L, 1992L, 1993L, 1989L, 1988L, 1988L, 1989L, 1990L,
1991L, 1992L, 1993L, 1994L, 2015L, 2012L, 2015L, 2005L, 2009L,
2010L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L,
2004L, 2005L, 2006L), YearBIRTH.MUM = c(1987L, 1987L, 1987L,
1987L, 1987L, NA, NA, NA, NA, NA, NA, 1987L, 1987L, 1987L, 1987L,
1987L, 1987L, 1987L, NA, NA, NA, NA, 2005L, 2005L, 1994L, 1994L,
1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L, 1994L
), OFFSpYR = c(1L, 0L, 4L, 3L, 3L, 1L, 1L, 0L, 3L, 1L, 3L, 3L,
1L, 0L, 2L, 0L, 0L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 0L, 1L, 0L,
1L, 2L, 0L, 0L, 0L, 0L, 1L)), .Names = c("Tattoo.MUM", "OffspringMUMs",
"YearBIRTH.CUB", "YearBIRTH.MUM", "OFFSpYR"), class = "data.frame", row.names = c(NA,
-35L))
I've tried:
library(tidyr)
library(dplyr)
# Attempt 1 (tidyr/dplyr): within each Tattoo.MUM/OffspringMUMs group, expand
# YearBIRTH.CUB to every year between its smallest and largest value.
# NOTE(review): fill() is given `OFFSpYR=0` here; tidyr::fill() selects columns
# whose NAs receive neighbouring values and does not appear to take a
# replacement value — confirm against the tidyr documentation.
df1 <- pedMUM %>% group_by(Tattoo.MUM, OffspringMUMs) %>% complete(YearBIRTH.CUB = full_seq(YearBIRTH.CUB,1)) %>% fill(OFFSpYR=0)
library(data.table)
# Attempt 2 (data.table): rolling join of pedMUM onto a cross join (CJ) of the
# unique values of the listed columns. The year sequence runs from the min to
# the max of YearBIRTH.CUB as written, with no per-mother restriction visible
# in this call — presumably spanning the whole column; verify.
df1 <- setDT(pedMUM)[CJ(Tattoo.MUM=Tattoo.MUM, OffspringMUMs=OffspringMUMs, YearBIRTH.MUM=YearBIRTH.MUM, YearBIRTH.CUB=seq(min(YearBIRTH.CUB), max(YearBIRTH.CUB)), unique=TRUE),
on=.(Tattoo.MUM, OffspringMUMs, YearBIRTH.CUB), roll=T]
I am obviously using tidyr, dplyr, and data.table wrongly because none have given me the results I want.
I've had a look at the following posts:
Add rows with missing years by group
Adding rows with values of "0" to a dataframe with missing data
Find missing month after grouping with dplyr
And even tried loops:
R code - clever loop to add rows
but I get confused when I try to determine the year sequence for each Tattoo.MUM within the loop.
Would anyone be able to point me in the right direction?
I haven't used complete() before, but the following seems to work. nesting() allows you to keep two variables together, full_seq() allows you to expand the values of a variable, and fill=list() allows you to fill in blanks.
pedMUM <- structure(list(Tattoo.MUM = structure(c(6L, 6L, 6L, 6L, 7L, 7L,
7L, 8L, 9L, 11L, 11L, 11L, 11L, 5L, 1L, 4L, 2L, 3L, 3L, 10L,
10L, 10L, 10L, 10L), .Label = c("10454", "1045A", "1045X", "12392",
"1601", "22", "27", "29", "41", "424X", "60"), class = "factor"),
OffspringMUMs = c(11L, 11L, 11L, 11L, 5L, 5L, 5L, 1L, 3L,
7L, 7L, 7L, 7L, 1L, 2L, 1L, 1L, 4L, 4L, 6L, 6L, 6L, 6L, 6L
), YearBIRTH.CUB = c(1989L, 1991L, 1992L, 1993L, 1990L, 1991L,
1993L, 1989L, 1988L, 1988L, 1989L, 1991L, 1994L, 2015L, 2012L,
2015L, 2005L, 2009L, 2010L, 1996L, 1998L, 2000L, 2001L, 2006L
), YearBIRTH.MUM = c(1987L, 1987L, 1987L, 1987L, NA, NA,
NA, NA, NA, 1987L, 1987L, 1987L, 1987L, NA, NA, NA, NA, 2005L,
2005L, 1994L, 1994L, 1994L, 1994L, 1994L), OFFSpYR = c(1L,
4L, 3L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L)), .Names = c("Tattoo.MUM",
"OffspringMUMs", "YearBIRTH.CUB", "YearBIRTH.MUM", "OFFSpYR"), class = "data.frame", row.names = c(NA,
-24L))
library(tidyr)
library(dplyr)
# Add a row with OFFSpYR = 0 for every year missing between each mother's
# first and last recorded cub year, keeping the mother-level columns as they are.
# NOTE(review): assumes OffspringMUMs and YearBIRTH.MUM are constant within a
# mother; if they vary, each distinct combination is completed separately —
# confirm against the full data.
df1 <- pedMUM %>%
  group_by(Tattoo.MUM) %>% # find min and max year for each mum
  mutate(
    minyear = min(YearBIRTH.CUB, na.rm = TRUE),
    maxyear = max(YearBIRTH.CUB, na.rm = TRUE)
  ) %>%
  # Drop the grouping before complete(): Tattoo.MUM is used inside nesting()
  # below, and tidyr does not allow completing a grouping column.
  ungroup() %>%
  complete( # complete table
    # only the mother-level combinations that actually occur ...
    nesting(Tattoo.MUM, minyear, maxyear, OffspringMUMs, YearBIRTH.MUM),
    # ... crossed with every year in the overall range of the data
    YearBIRTH.CUB = full_seq(YearBIRTH.CUB, 1),
    # integer zero keeps OFFSpYR an integer column
    fill = list(OFFSpYR = 0L)
  ) %>%
  # remove the years outside each mother's own first-to-last range
  filter(YearBIRTH.CUB >= minyear & YearBIRTH.CUB <= maxyear) %>%
  # drop the helper columns and return the original column order
  select(all_of(names(pedMUM)))
I am trying to generate a conditional dummy variable "X" with the following rule:
set X=1 if Y is =1, two years prior to the NA.
In other words, X=1/0 depending on [0/1=year1,0/1=year2,NA].
For example, as seen below, if the pattern for Y is 0,0,NA then X = 0 for both years prior to the NA. If the pattern for Y is 0,1,NA or 1,0,NA then X = 1 in the year where Y = 1. To be clear, if the pattern is 1,1,NA then X = 1 only in the first of those years; it should count only once (X = 1), not twice.
The code that I have now (thanks #Auréle, from my previous question here) is the closest that I have to generate it.
# Asker's current attempt at building the dummy X (reported below as not
# matching the rule described above).
dat2 <- dat1 %>%
group_by(country) %>%
# Add a sub-group id within country: the counter increases on every row whose
# previous Y is NA (lag(Y) is NA on a country's first row as well).
# NOTE(review): `add =` was renamed `.add =` in dplyr 1.0.0 — confirm against
# the installed dplyr version.
group_by(grp = cumsum(is.na(lag(Y))), add = TRUE) %>%
# first_year_at_1: position of the first Y == 1 in the sub-group, multiplied by
# two logical flags (the sub-group contains an NA; a 1 occurs among its last
# three values).
mutate(first_year_at_1 = match(1, Y) * any(is.na(Y)) * any(tail(Y, 3) == 1L),
# X: integer vector of zeros with 1L assigned at index first_year_at_1.
X = {x <- integer(length(Y)) ; x[first_year_at_1] <- 1L ; x}) %>%
ungroup()
However, it doesn’t really generate what I described above. Any help here would be much appreciated.
Below you can see my sample data with the desired outcome "X" dummy in it.
data <- structure(list(year = c(1991L, 1992L, 1993L, 1994L, 1995L, 1996L,
1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L,
2006L, 2007L, 2008L, 2009L, 2010L, 2011L, 1990L, 1991L, 1992L,
1993L, 1994L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L,
2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L,
2011L, 1990L, 1991L, 1992L, 1993L, 1994L, 1995L, 1996L, 1997L,
1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L,
2007L, 2008L, 2009L, 2010L, 2011L, 1990L, 1991L, 1992L, 1993L,
1994L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L,
2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L, 2011L,
1990L, 1991L, 1992L, 1993L, 1994L, 1995L, 1996L, 1997L, 1998L,
1999L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L,
2007L, 2008L, 2009L, 2010L, 2011L), country = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("Canada",
"Cuba", "Dominican Republic", "Haiti", "Jamaica"), class = "factor"),
Y = c(1L, NA, 1L, 1L, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L,
1L, NA, 1L, NA, 1L, NA, 1L, NA, NA, 1L, 1L, NA, NA, 1L, NA,
1L, NA, 1L, NA, 1L, 1L, 1L, 1L, NA, 1L, NA, 1L, NA, 1L, NA,
NA, 1L, NA, 1L, 0L, 0L, 0L, 1L, NA, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 1L, NA, 0L, 1L, 1L, NA, 0L, 1L, NA, 1L, NA, 1L, NA, 1L,
NA, 1L, NA, 1L, 1L, 1L, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L,
0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, NA, 0L, 1L, 1L, 1L,
NA, 1L, NA, 0L, 1L, 1L, NA), X = c(1L, 0L, 0L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L,
0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L,
0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L,
1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-110L))
To be honest the question is not 100% clear, but I thought I'd give it a shot, so here it goes:
# Flag, per country, the years in which Y == 1 shortly before a missing Y:
# X_2 is 1 when Y == 1 and Y is missing two years later, or when Y == 1, Y is
# missing the next year and the previous year's Y was not 1; otherwise 0.
# NOTE(review): `data_1` is not defined in this snippet — presumably the
# sample data frame from the question; confirm.
data_new <- data_1 %>%
  # NA cannot be compared with `==`, so recode missing Y to the sentinel -1.
  mutate(Y_2 = ifelse(is.na(Y), -1, Y)) %>%
  group_by(country) %>%
  mutate(
    X_2 = ifelse(
      (Y_2 == 1 &
        # `default = 0` makes a look-ahead past the end of a country's series
        # compare as "not missing", so the last rows get 0 instead of NA.
        lead(Y_2, 1, default = 0) == -1 &
        # the first row of a country has no previous year; treat it as "not 1"
        (lag(Y_2, 1) != 1 | is.na(lag(Y_2, 1)))) |
        (Y_2 == 1 & lead(Y_2, 2, default = 0) == -1),
      1, 0
    )
  ) %>%
  # return an ungrouped data frame so later steps are not silently per-country
  ungroup()
basically I formulated the condition as follows:
X is 1 in two cases:
if Y == 1 and Y after two years is NA
or if (Y == 1) and (Y next year is NA) and (Y on the year before is not 1)
A couple of notes:
Since we can't use NAs in comparisons, I used the column Y_2 to replace the NAs with the value -1, and then used it in the comparison
The condition (Y on the year before is not 1) also might cause problems in the first recorded row (year) of each group (country) when Y == 1, which is why I included this case also in the condition (i.e (lag(Y_2,1)!=1 | is.na(lag(Y_2,1))))
Like mentioned in the comment by #andrew_reece, the pattern you're trying to get has a lot of edge cases, only one of which is in the point above, other example might be if Y == 1 in the last couple of years for some country, how would you handle that?
Try considering a more specified description of your conditions based on the data you have
hope this helps
A ggplot2 novice here
I am trying to generate a time series of the sample data given below using ggplot2. The following short code does not give me what I want.
# Attempt: one line per period-by-season combination, drawn in a separate
# free-scaled panel row for each season.
ggplot(
  data = dat,
  mapping = aes(
    x = year,
    y = data,
    fill = period,
    group = interaction(period, season)
  )
) +
  facet_grid(rows = vars(season), scales = "free") +
  geom_line()
You can see that the lines appear awkward. How can I plot cu and fu together for each season? Use red color for cu and blue for fu.
dat=structure(list(period = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("cu", "fu"), class = "factor"),
season = structure(c(2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 1L,
1L, 1L, 1L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L,
1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L), .Label = c("DJF", "JJA",
"MAM", "SON"), class = "factor"), month = structure(c(7L,
6L, 2L, 7L, 12L, 11L, 10L, 12L, 3L, 5L, 4L, 3L, 8L, 1L, 9L,
8L, 7L, 6L, 2L, 7L, 12L, 11L, 10L, 12L, 3L, 5L, 4L, 3L, 8L,
1L, 9L, 8L), .Label = c("april", "august", "dec", "feb",
"jan", "july", "june", "march", "may", "nov", "oct", "sep"
), class = "factor"), year = c(2001L, 2001L, 2001L, 2002L,
2001L, 2001L, 2001L, 2002L, 2001L, 2001L, 2001L, 2002L, 2001L,
2001L, 2001L, 2002L, 2001L, 2001L, 2001L, 2002L, 2001L, 2001L,
2001L, 2002L, 2001L, 2001L, 2001L, 2002L, 2001L, 2001L, 2001L,
2002L), data = c(84.08969137, 76.4948428, 18.35492802, 101.8821712,
24.21773903, 16.44881361, 19.57283027, 48.27623315, 8.572824549,
12.97601394, 11.50496081, 15.14899058, 13.96396375, 27.21030149,
36.1606234, 23.35430348, 95.77643784, 94.84972642, 47.26900009,
2.385978093, 21.48062239, 24.67779645, 20.07044416, 43.09234771,
13.28295078, 19.27189857, 15.24661793, 21.75991334, 19.38239851,
39.93109491, 38.54500325, 33.77559647)), .Names = c("period",
"season", "month", "year", "data"), class = "data.frame", row.names = c(NA,
-32L))
Thanks for any suggestions.
I would do something like :
library(ggplot2)
# Plot the series against a real date (first day of each month), one line per
# period inside each season panel, with cu in red and fu in blue as requested.
ggplot(
  dat,
  aes(
    # NOTE(review): "%b" parses month names using the current locale —
    # presumably an English locale is expected for names like "june"; confirm.
    x = as.Date(sprintf("%s-%s-01", year, month), "%Y-%b-%d"),
    y = data,
    group = period,
    color = period
  )
) +
  geom_line() +
  facet_grid(season ~ ., scales = "free") +
  # explicit colours per period level instead of the default palette
  scale_colour_manual(values = c(cu = "red", fu = "blue")) +
  xlab("time")
In fact I am creating a regular date and grouping just by period.
I have data on countries and want to summarize it and create a table.
> head(data)
country year score members
A 1989 0 7
A 1990 0 7
A 1991 0 7
A 1992 0 7
A 1993 0 7
A 1994 0 7
The table should show the relationship between country "score" and the number of "members" – put differently, I want to see how many states with score 0,1 or 2 have "members"(ranging from 1 to 7).
I want to set it like this:
score members==1 members==2 members==3 members==4 members==5 members==6 members==7
0 1 0
1 2 0
2 0 1 and so on..
To do this I run the following:
library(dplyr)
# For each score, count the rows whose members value equals 1, 2, ..., 7.
# `%in%` never yields NA, so rows with a missing members value add nothing.
table <- data %>%
  group_by(score) %>%
  summarise(
    m1 = sum(members %in% 1),
    m2 = sum(members %in% 2),
    m3 = sum(members %in% 3),
    m4 = sum(members %in% 4),
    m5 = sum(members %in% 5),
    m6 = sum(members %in% 6),
    m7 = sum(members %in% 7)
  )
This gives:
score m1 m2 m3 m4 m5 m6 m7
0 0 2 0 0 0 3 30
1 15 3 11 11 3 18 3
2 3 0 2 2 0 6 9
.
.
I need a little help here. As you see it has calculated the total number of observations, whereas I want to count each country only once.
How do I summarize this data to have the total number of countries for each members-level?
Here's a sample of my data for reproducibility:
data <-
structure(list(country = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L), .Label = c("A", "B", "C", "D", "E", "F"), class = "factor"),
year = c(1989L, 1990L, 1991L, 1992L, 1993L, 1994L, 1995L,
1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L,
2005L, 2006L, 2007L, 2008L, 2010L, 1989L, 1990L, 1991L, 1992L,
1993L, 1994L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L,
2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L,
2011L, 1989L, 1991L, 1993L, 1994L, 1995L, 1996L, 1997L, 1999L,
2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L,
2010L, 1989L, 1990L, 1991L, 1992L, 1993L, 1994L, 1995L, 1996L,
1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L,
2006L, 2007L, 2008L, 2009L, 2010L, 2011L, 1991L, 1992L, 1993L,
1994L, 1995L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L,
2004L, 2005L, 2006L, 2007L, 2008L, 2010L, 1991L, 1992L, 1993L,
1994L, 1995L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L,
2004L, 2005L, 2006L, 2007L, 2008L, 2010L), score = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
2L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), members = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L)), .Names = c("country", "year", "score",
"members"), class = "data.frame", row.names = c(NA, -121L))
I believe you need this:
library(reshape2)
# Count the distinct countries for every score/members pair, then cast to wide
# form with one column per members value; pairs that do not occur become 0.
dcast(
  data = aggregate(
    country ~ score + members,
    data = data,
    FUN = function(ids) length(unique(ids))
  ),
  formula = score ~ members,
  value.var = "country",
  fill = 0L
)
# score 1 2 3 4 5 6 7
#1 0 0 1 0 0 0 1 2
#2 1 1 1 2 2 1 3 2
#3 2 1 0 1 2 0 1 1
Or, to put it the dplyr/tidyr way:
# Number of distinct countries per members/score pair, reshaped so that each
# members value becomes its own column (missing pairs filled with 0).
data %>%
  group_by(members, score) %>%
  summarise(n = n_distinct(country)) %>%
  spread(key = members, value = n, fill = 0L)
## A tibble: 3 x 8
# score 1 2 3 4 5 6 7
#* <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 0 0 1 0 0 0 1 2
#2 1 1 1 2 2 1 3 2
#3 2 1 0 1 2 0 1 1
As the OP is using dplyr methods, we can do this by grouping with 'score', 'members' to get the number of elements (n()), and then spread (from tidyr) to reshape it to 'wide' format.
library(dplyr)
library(tidyr)
# Row counts per score/members pair, with the members values turned into
# column names m1, m2, ... and missing pairs filled with 0.
data %>%
  group_by(score, members) %>%
  summarise(n = n()) %>%
  mutate(members = paste0("m", members)) %>%
  spread(key = members, value = n, fill = 0)
# score m1 m2 m3 m4 m5 m6 m7
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 0 0 2 0 0 0 3 30
#2 1 15 3 11 11 3 18 3
#3 2 3 0 2 2 0 6 9
If we need to also get the counts by 'country', just add 'country' in the group_by
# Same reshaping as above, with the row counts additionally split by country.
data %>%
  group_by(country, score, members) %>%
  summarise(n = n()) %>%
  mutate(members = paste0("m", members)) %>%
  spread(key = members, value = n, fill = 0)
If the expected output is the one shown in the other posts, an option using data.table would be to convert the 'data.frame' to a 'data.table' (setDT(data)), and dcast from 'long' to 'wide', specifying the fun.aggregate as uniqueN of the 'value.var' variable, i.e. 'country', where uniqueN returns the number of unique elements in the 'country' column. The fill=0 argument puts 0 in the cells for combinations that are not available; by default, these are returned as NA.
library(data.table)
# Convert `data` to a data.table with setDT(), then cast to wide form: one row
# per score, one column per members value, each cell holding the number of
# unique countries (0 where a combination does not occur).
dcast(
  setDT(data),
  formula = score ~ members,
  value.var = "country",
  fun.aggregate = uniqueN,
  fill = 0
)
# score 1 2 3 4 5 6 7
#1: 0 0 1 0 0 0 1 2
#2: 1 1 1 2 2 1 3 2
#3: 2 1 0 1 2 0 1 1
It seems the crux of the issue is having the duplicated rows for each year? In which case you can remove them with distinct, then it's a simple crosstab. You could use the %$% exposition pipe from magrittr:
library(dplyr)
library(magrittr)
# Keep one row per country/score/members combination, then expose the columns
# to table() via the exposition pipe to cross-tabulate score against members.
data %>%
  distinct(country, score, members) %$%
  table(score = score, members = members)
members
score 1 2 3 4 5 6 7
0 0 1 0 0 0 1 2
1 1 1 2 2 1 3 2
2 1 0 1 2 0 1 1
Or a regular pipe and tabyl from the janitor package:
library(dplyr)
library(janitor)
# Keep one row per country/score/members combination, then cross-tabulate
# score (rows) against members (columns) with janitor's tabyl().
data %>%
  distinct(country, score, members) %>%
  tabyl(var1 = score, var2 = members)
score 1 2 3 4 5 6 7
0 0 1 0 0 0 1 2
1 1 1 2 2 1 3 2
2 1 0 1 2 0 1 1
I need an xy plot which plots means and error bars for x and y with three factors. The three factors are Year (2004-2012), Species (FW, HB), and Region (Kodiak, Shumagin Islands); xmean=mean d13C and ymean=mean d15N.
I can get reasonably close using the following code, but am missing one factor and it's not very aesthetically pleasing. I also get a warning message for exceeding the shape palette.
library(ggplot2)
library(plyr)
GAP_Whales<-structure(list(Species = structure(c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 2L), .Label = c("FW", "HB"), class = "factor"), Year = c(2007L,
2007L, 2007L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L,
2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L,
2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L,
2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L,
2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L,
2004L, 2004L, 2004L, 2004L, 2005L, 2005L, 2005L, 2005L, 2005L,
2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L,
2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L,
2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L,
2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L,
2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L,
2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L,
2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L,
2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2007L, 2005L, 2005L, 2005L, 2005L,
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L,
2001L, 2001L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2008L, 2008L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2008L, 2008L, 2008L, 2008L, 2008L, 2008L,
2008L, 2008L, 2008L, 2008L, 2008L, 2008L, 2008L, 2008L, 2008L,
2008L, 2008L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L,
2009L, 2009L, 2005L, 2005L, 2007L, 2007L, 2007L, 2008L, 2008L,
2008L, 2008L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L,
2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L,
2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L,
2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2007L
), Region = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("Kodiak", "Shumagin Is."), class = "factor"), d13C = c(-17.741,
-17.841, -17.382, -16.955, -17.504, -17.42814286, -15.89, -16.421,
-17.21328571, -17.90142857, -17.654, -19.225, -19.03361128, -18.29057143,
-17.28, -16.897, -18.18585714, -18, -17.619, -17.47014286, -18.382,
-16.807, -18.55242857, -18.527, -17.72557143, -17.06471429, -18.02757143,
-17.599, -17.57614286, -17.36385714, -17.19728571, -18.09871429,
-17.3, -15.928, -17.26071429, -17.85271429, -17.39342857, -16.98,
-16.847, -17.84728571, -16.673, -17.174, -16.277, -17.965, -17.60985714,
-16.6, -17.47885714, -17.46, -17.73342857, -17.028, -18.332,
-18.548, -18.22, -18.035, -17.138, -17.387, -18.314, -18.27,
-17.904, -18.497, -18.264, -18.593, -18.264, -18.008, -17.687,
-18.227, -17.849, -17.713, -18.017, -18.723, -18.793, -17.914,
-18.31, -18.116, -18.65, -17.587, -17.985, -18.793, -17.833,
-17.613, -17.942, -17.86, -17.749, -17.222, -17.286, -17.842,
-18.042, -17.912, -17.858, -18.916, -18.443, -18.638, -18.085,
-17.974, -17.997, -18.387, -18.129, -17.85, -18.699, -18.754,
-18.022, -18.636, -18.197, -18.645, -18.149, -18.157, -18.695,
-18.413, -17.978, -18.447, -17.854, -19.433, -18.251, -17.527,
-17.732, -18.42, -18.089, -17.498, -18.805, -17.677, -17.721,
-18.194, -18.063, -16.987, -18.34342857, -18.46185714, -17.56328571,
-17.84671429, -17.93814286, -18.10157143, -17.786, -17.78442857,
-17.38885714, -16.61228571, -15.97971429, -18.73614286, -18.26371429,
-18.98442857, -17.47014286, -18.12185714, -17.89457143, -18.17728571,
-18.234, -18.83871429, -18.82342857, -18.34314286, -18.43685714,
-18.66757143, -18.6295, -17.553, -17.72555609, -17.42890918,
-18.0937904, -17.3725821, -21.242, -20.107, -19.11, -17.771,
-18.125, -18.577, -17.781, -19.292, -16.776, -20.212, -20.539,
-17.972, -17.986, -18.634, -17.352, -17.409, -17.93, -17.458,
-17.53, -17.321, -17.11, -17.929, -17.244, -17.858, -17.251,
-18.06, -18.22, -18.142, -19.314, -18.412, -17.941, -17.909,
-18.114, -18.783, -18.181, -17.754, -18.484, -17.463, -18.379,
-18.19, -18.227, -17.414, -17.824, -17.436, -17.432, -17.171,
-17.483, -17.64, -17.639, -18.572, -18.545, -18.007, -18.033,
-18.102, -18.301, -17.731, -17.565, -17.68, -17.751, -18.134,
-18.409, -18.336, -18.888, -18.477, -18.25, -18.121, -18.082,
-17.914, -19.337, -19.228, -18.455, -18.657, -18.081, -18.23,
-18.777, -18.935, -18.823, -19.49, -18.383, -18.73, -18.152,
-18.582, -18.653, -18.407, -18.024, -18.994, -17.831, -17.947,
-17.57, -18.142, -17.691, -17.869, -18.513, -18.522, -17.923,
-18.353, -18.278, -17.664, -17.995, -17.786, -18.161, -18.119,
-18.125, -17.098, -17.576, -18.099, -18.713, -17.4, -17.622,
-17.532, -18.007, -18.146, -16.692, -18.678, -19.18, -18.522,
-18.572, -18.476, -19.144, -17.709, -17.742, -18.606, -18.267,
-18.543, -18.301, -19.117, -18.75, -19.394, -19.219, -18.179,
-18.681, -18.835, -18.456, -18.323, -18.148, -18.263, -17.965,
-19.337, -18.301, -19.046, -18.768, -18.017, -17.928, -17.314
), d15N = c(14.166, 14.279, 14.092, 13.464, 13.4, 13.179, 12.895,
13.537, 13.857, 13.775, 14.147, 12.017, 12.531, 12.329, 13.414,
13.777, 12.639, 13.135, 13.833, 13.68, 12.317, 12.237, 11.707,
12.318, 13.574, 14.77, 12.722, 13.772, 13.658, 13.804, 14.07,
15.182, 14.143, 13.54, 12.932, 13.77, 14.332, 12.642, 13.166,
12.412, 12.452, 14.09971429, 13.14, 13.643, 13.393, 13.759, 13.791,
13.244, 12.997, 13.86, 15.53828571, 14.42107143, 14.88228571,
13.32828571, 14.17421429, 12.94985714, 13.21614286, 11.18814286,
12.53371429, 12.67442857, 13.50585714, 12.64092857, 12.83257143,
12.03907143, 12.54642857, 13.70371429, 13.18142857, 14.76085714,
12.74385714, 13.7225, 11.76364286, 13.66457143, 12.65378571,
12.50114286, 14.27671429, 14.10342857, 14.3445, 11.72657143,
12.90221429, 14.71314286, 14.71907143, 14.04371429, 13.75092857,
13.74578571, 14.94164286, 13.07035714, 13.07685714, 12.8775,
13.86664286, 12.87185714, 13.75214286, 13.20285714, 12.46021429,
13.13914286, 13.82028571, 12.52585714, 13.4975, 12.88071429,
12.48042857, 14.29857143, 13.56214286, 13.41, 13.52985714, 13.55592857,
12.80007143, 12.91257143, 13.37457143, 13.60371429, 13.88671429,
13.44635714, 14.18214286, 10.09042857, 12.11571429, 13.00771429,
15.45157143, 13.33135714, 14.58378571, 11.78642857, 12.47628571,
14.46642857, 12.37064286, 13.44335714, 12.39628571, 14.08, 14.0505,
14.34, 14.0145, 13.926, 13.2355, 13.111, 12.3725, 13.888, 13.1075,
14.015, 14.9595, 12.857, 13.277, 12.457, 12.137, 13.124, 13.299,
12.811, 12.231, 11.829, 12.263, 13.036, 13.331, 12.76, 12.262,
14.026, 13.452, 13.769, 13.221, 13.059, 12.754, 12.637, 13.025,
15.123, 14.006, 12.605, 12.636, 14.229, 15.527, 11.583, 13.004,
12.851, 12.921, 12.273, 13.922, 13.429, 12.494, 13.803, 13.55,
13.387, 14.887, 14.248, 14.673, 14.603, 12.879, 12.4, 13.676,
13.648, 13.067, 13.353, 11.703, 14.118, 12.78, 12.293, 12.68,
13.494, 13.309, 13.838, 12.688, 14.418, 14.357, 14.587, 14.714,
14.435, 13.418, 13.013, 12.631, 12.704, 13.091, 12.953, 12.751,
12.409, 12.921, 12.216, 12.594, 12.698, 14.891, 14.692, 13.187,
13.451, 13.023, 11.957, 12.401, 12.527, 13.47, 11.771, 11.848,
12.399, 12.502, 12.678, 12.768, 12.716, 12.671, 12.61, 13.132,
12.999, 13.251, 11.048, 14.384, 12.688, 13.196, 12.875, 13.495,
12.895, 12.992, 12.888, 13.044, 14.195, 13.643, 13.042, 13.15,
13.437, 13.835, 14.884, 13.136, 14.384, 13.927, 14.914, 12.978,
12.841, 13.793, 14.312, 14.219, 14.36, 13.529, 11.837, 13.166,
13.103, 12.798, 13.529, 12.813, 9.574, 13.859, 12.548, 13.405,
12.6, 12.373, 12.964, 12.896, 13.067, 13.896, 14.533, 14.024,
13.042, 13.213, 13.857, 12.857, 12.393, 11.841, 13.702, 13.634,
14.391, 13.719, 13.181, 13.566, 13.314, 13.457, 12.871, 12.383,
13.62, 13.753, 13.388, 12.856, 14.408)), .Names = c("Species",
"Year", "Region", "d13C", "d15N"), class = "data.frame", row.names = c(NA,
-298L))
# Mean and standard deviation of d13C (x) and d15N (y) for every
# Species x Year x Region group.
means <- ddply(
  GAP_Whales,
  .(Species, Year, Region),
  function(x) {
    c(
      xmean = mean(x$d13C), xsd = sd(x$d13C),
      ymean = mean(x$d15N), ysd = sd(x$d15N)
    )
  }
)
# Stand-alone factor copies of the grouping columns. The plot below does not
# depend on them: names used inside aes() are looked up in `means` first.
Species <- as.factor(means$Species)
Region <- as.factor(means$Region)
Year <- as.factor(means$Year)

# Refer to columns by bare name inside aes() rather than `means$...`, so every
# layer evaluates the mapping against its own data and the axes get clean labels.
p <- ggplot(means, aes(x = xmean, y = ymean))
p <- p + geom_point(aes(shape = factor(Year), color = factor(Region)))
p <- p +
  geom_errorbar(aes(ymin = ymean - ysd, ymax = ymean + ysd), width = .1) +
  # the end caps of horizontal error bars are sized with `height`, not `width`
  geom_errorbarh(aes(xmin = xmean - xsd, xmax = xmean + xsd), height = .1)
p