Plot aggregate with multiple columns and multiple variables - r

Attempting to plot aggregate data from the following data.
Person Time Period Value SMA2 SMA3 SMA4
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 1 14 NA NA NA
2 A 2 1 8 11 NA NA
3 A 3 1 13 10.5 11.7 NA
4 A 4 1 12 12.5 11 11.8
5 A 5 1 19 15.5 14.7 13
6 A 6 1 9 14 13.3 13.2
7 A 7 2 14 NA NA NA
8 A 8 2 7 10.5 NA NA
9 A 9 2 11 9 10.7 NA
10 A 10 2 14 12.5 10.7 11.5
# ... with 26 more rows
I have used aggregate(DataSet[,c(4,5,6,7)], by=list(DataSet$Person), na.rm = TRUE, max) to get the following:
Group.1 Value SMA2 SMA3 SMA4
1 A 20 18.0 16.66667 15.25
2 B 20 17.0 16.66667 15.00
3 C 19 18.5 14.33333 14.50
I'd like to plot the maxes for each SMA for Person A, B, and C on the same plot.
I would also like to be able to plot the mean of these maxes for each SMA column.
Any help is appreciated.

Like so? Or are you looking for something different?
df <- data.frame("Group.1"=c("A","B","C"), "Value"=c(20,20,20),
"SMA2"=c(18.0, 17.0, 18.5), "SMA3" =c(16.667, 16.667, 14.333),
"SMA4"=c(15.25, 15.00, 14.50))
library(ggplot2)
library(tidyr)
df.g <- df %>%
gather(SMA, Value, -Group.1)
df.g$SMA <- factor(df.g$SMA, levels=c("Value", "SMA2", "SMA3", "SMA4"))
means <- df.g %>%
group_by(SMA) %>%
summarise(m=mean(Value))
ggplot(df.g, aes(x=SMA, y=Value, group=Group.1, colour=Group.1)) +
geom_line() +
geom_point(data=means, aes(x=SMA, y=m), inherit.aes = F)

Related

MCMCglmm data format difficulties

I want to estimate the heritability of animal traits using an animal model. However, I can't figure out how to properly format my data so that MCMCglmm can create a model. After much trial and error and scouring the internet for advice, I'm stuck. To my knowledge, I've formatted the data as suggested by all available resources that I know of, yet I get the following error message::
Error in MCMCglmm(BWT ~ 1, random = ~animal, pedigree = Ped, data = Data, :
some levels of animal do not have a row entry in ginverse
My questions are: What is the ginverse, exactly, and why doesn't it have row entries for all levels of animal?
Here are my two (dummy) data sets:
Animal phenotype data:
> Data
# A tibble: 10 x 6
ANIMAL MOTHER BYEAR SEX BWT TARSUS
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 11 968 1 10.8 24.8
2 2 11 968 1 9.3 22.5
3 3 12 970 2 3.98 12.9
4 4 12 970 1 5.39 20.5
5 5 13 970 2 12.1 NA
6 6 13 970 1 NA NA
7 7 14 971 2 NA NA
8 8 15 971 1 7.63 14.2
9 9 16 971 1 4.76 NA
10 10 17 971 1 NA NA
names(Data)[1] <- "animal"
Data$animal<-as.factor(Data$animal)
Data$MOTHER<-as.factor(Data$MOTHER)
Data$BYEAR<-as.factor(Data$BYEAR)
Data$SEX<-as.factor(Data$SEX)
Data$BWT<-as.numeric(Data$BWT)
Data$TARSUS<-as.numeric(Data$TARSUS)
Pedigree data:
> Ped
# A tibble: 17 x 3
ID MOTHER FATHER
<dbl> <dbl> <dbl>
1 1 11 18
2 2 11 NA
3 3 12 NA
4 4 12 19
5 5 13 20
6 6 13 NA
7 7 14 NA
8 8 15 21
9 9 16 22
10 10 17 23
11 11 NA NA
12 12 NA NA
13 13 NA NA
14 14 NA NA
15 15 NA NA
16 16 NA NA
17 17 NA NA
Load required packages. Need to format Ped as a matrix, then use the insertPed() and orderPed() functions so that parents appear before offspring in the ID field:
library(MCMCglmm)
library(MasterBayes)
Ped <- as.matrix(Ped)
Ped <- insertPed(Ped)
Ped <- orderPed(Ped)
Reformat to data.frame
Ped <- as.data.frame(Ped)
Load the package, estimate variance and priors, and generate model:
p.var <- var(Data$BWT , na.rm=TRUE)
prior1.1 <- list(G=list(G1=list(V=matrix(p.var/2),n=1)),
R=list(V=matrix(p.var/2),n=1))
model1.1 <- MCMCglmm(BWT ~ 1 , random = ~animal, pedigree = Ped, data = Data, prior = prior1.1)
Problem solved!
The Error message was getting thrown because the data set Data also needed to be formatted Data <- as.data.frame(Data)
Thanks to Jarrod Hadfield for the simple yet evasive solution.

find first occurrence in two variables in df

I need to find the first two times my df meets a certain condition grouped by two variables. I am trying to use the ddply function, but I am doing something wrong with the ".variables" command.
So in this example, I'm trying to find the first two times x > 30 and y > 30 in each group / trial.
The way I'm using ddply is giving me the first two times in the dataset, then repeating that for every group.
set.seed(1)
df <- data.frame((matrix(nrow=200,ncol=5)))
colnames(df) <- c("group","trial","x","y","hour")
df$group <- rep(c("A","B","C","D"),each=50)
df$trial <- rep(c(rep(1,times=25),rep(2,times=25)),times=4)
df[,3:4] <- runif(400,0,50)
df$hour <- rep(1:25,time=8)
library(plyr)
ddply(.data=df, .variables=c("group","trial"), .fun=function(x) {
i <- which(df$x > 30 & df$y >30 )[1:2]
if (!is.na(i)) x[i, ]
})
Expected results:
group trial x y hour
13 A 1 34.3511423 38.161134 13
15 A 1 38.4920710 40.931734 15
36 A 2 33.4233369 34.481392 11
37 A 2 39.7119930 34.470671 12
52 B 1 43.0604738 46.645491 2
65 B 1 32.5435234 35.123126 15
But instead, my code is finding c(1,4) from the first grouptrial and repeating that over for every grouptrial:
group trial x y hour
1 A 1 34.351142 38.161134 13
2 A 1 38.492071 40.931734 15
3 A 2 5.397181 27.745031 13
4 A 2 20.563721 22.636003 15
5 B 1 22.953286 13.898301 13
6 B 1 32.543523 35.123126 15
I would also like for there to be rows of NA if a second occurrence isn't present in a group*trial.
Thanks,
I think this is what you want:
library(tidyverse)
df %>% group_by(group, trial) %>% filter(x > 30 & y > 30) %>% slice(1:2)
Result:
# A tibble: 16 x 5
# Groups: group, trial [8]
group trial x y hour
<chr> <dbl> <dbl> <dbl> <int>
1 A 1 33.5 46.3 4
2 A 1 32.6 42.7 11
3 A 2 35.9 43.6 4
4 A 2 30.5 42.7 14
5 B 1 33.0 38.1 2
6 B 1 40.5 30.4 7
7 B 2 48.6 33.2 2
8 B 2 34.1 30.9 4
9 C 1 33.0 45.1 1
10 C 1 30.3 36.7 17
11 C 2 44.8 33.9 1
12 C 2 41.5 35.6 6
13 D 1 44.2 34.3 12
14 D 1 39.1 40.0 23
15 D 2 39.4 47.5 4
16 D 2 42.1 40.1 10
(slightly different from your results, probably a different R version)
I reccomend using dplyr or data.table rather than plyr. From the plyr github page:
plyr is retired: this means only changes necessary to keep it on CRAN
will be made. We recommend using dplyr (for data frames) or purrr (for
lists) instead.
Since someone has already provided a solution with dplyr, here is one option with data.table.
In the selection df[i, j, k] I am selecting rows which match your criteria in i, grouping by the given variables in k, and selecting the first two rows (head) of each group-specific subset of the data .SD. All of this inside the brackets is data.table specific, and only works because I converted df to a data.table first with setDT.
library(data.table)
setDT(df)
df[x > 30 & y > 30, head(.SD, 2), by = .(group, trial)]
# group trial x y hour
# 1: A 1 34.35114 38.16113 13
# 2: A 1 38.49207 40.93173 15
# 3: A 2 33.42334 34.48139 11
# 4: A 2 39.71199 34.47067 12
# 5: B 1 43.06047 46.64549 2
# 6: B 1 32.54352 35.12313 15
# 7: B 2 48.03090 38.53685 5
# 8: B 2 32.11441 49.07817 18
# 9: C 1 32.73620 33.68561 1
# 10: C 1 32.00505 31.23571 20
# 11: C 2 32.13977 40.60658 9
# 12: C 2 34.13940 49.47499 16
# 13: D 1 36.18630 34.94123 19
# 14: D 1 42.80658 46.42416 23
# 15: D 2 37.05393 43.24038 3
# 16: D 2 44.32255 32.80812 8
To try a solution that is closer to what you've tried so far we can do the following
ddply(.data=df, .variables=c("group","trial"), .fun=function(df_temp) {
i <- which(df_temp$x > 30 & df_temp$y >30 )[1:2]
df_temp[i, ]
})
Some explanation
One problem with the code that you provided is that you used df inside of ddply. So you defined fun= function(x) but you didn't look for cases of x> 30 & y> 30 in x but in df. Further, your code uses i for x, but i was defined with df. Finally, to my understanding there is no need for if (!is.na(i)) x[i, ]. If there is only one row that meets your condition, you will get a row with NAs anayway, because you use which(df_temp$x > 30 & df_temp$y >30 )[1:2].
Using dplyr, you can also do:
df %>%
group_by(group, trial) %>%
slice(which(x > 30 & y > 30)[1:2])
group trial x y hour
<chr> <dbl> <dbl> <dbl> <int>
1 A 1 34.4 38.2 13
2 A 1 38.5 40.9 15
3 A 2 33.4 34.5 11
4 A 2 39.7 34.5 12
5 B 1 43.1 46.6 2
6 B 1 32.5 35.1 15
7 B 2 48.0 38.5 5
8 B 2 32.1 49.1 18
Since everything else is covered here is a base R version using split
output <- do.call(rbind, lapply(split(df, list(df$group, df$trial)),
function(new_df) new_df[with(new_df, head(which(x > 30 & y > 30), 2)), ]
))
rownames(output) <- NULL
output
# group trial x y hour
#1 A 1 34.351 38.161 13
#2 A 1 38.492 40.932 15
#3 B 1 43.060 46.645 2
#4 B 1 32.544 35.123 15
#5 C 1 32.736 33.686 1
#6 C 1 32.005 31.236 20
#7 D 1 36.186 34.941 19
#8 D 1 42.807 46.424 23
#9 A 2 33.423 34.481 11
#10 A 2 39.712 34.471 12
#11 B 2 48.031 38.537 5
#12 B 2 32.114 49.078 18
#13 C 2 32.140 40.607 9
#14 C 2 34.139 49.475 16
#15 D 2 37.054 43.240 3
#16 D 2 44.323 32.808 8

Counting the number of appearances of a certain value in a column

I have a column in my data frame which looks like this:
> df
# A tibble: 20 x 1
duration
<dbl>
1 0
2 40.0
3 247.
4 11.8
5 116.
6 10.2
7 171.
8 7.58
9 87.8
10 23.2
11 390.
12 35.8
13 4.73
14 29.1
15 0
16 36.8
17 73.8
18 12.9
19 124.
20 10.7
I need to group this data, so that all rows starting from a 0 to the last row before the next zero are in a group. I've accomplished this using a for-loop:
counter <- 0
df$group <- NA
df$group[1] <- 1
for (i in 2:NROW(df)) {
df$group[i] <-
ifelse(df$duration[i] == 0, df$group[i - 1] + 1, df$group[i - 1])
}
which gives me the desired output:
> df
# A tibble: 20 x 2
duration group
<dbl> <dbl>
1 0 1
2 40.0 1
3 247. 1
4 11.8 1
5 116. 1
6 10.2 1
7 171. 1
8 7.58 1
9 87.8 1
10 23.2 1
11 390. 1
12 35.8 1
13 4.73 1
14 29.1 1
15 0 2
16 36.8 2
17 73.8 2
18 12.9 2
19 124. 2
20 10.7 2
But as my original dataframe is quite big i'm looking for a faster solution, and I've been trying to get it working with dplyr but to no avail. Other related questions are counting how often the current value has already appeared, not a specific one so I haven't found a solution to this problem yet.
I'd appreaciate your help in finding a vectorized solution for my problem, thanks! Heres the example-data:
df <-
structure(
list(
duration = c(
0,
40.0009999275208,
247.248000144958,
11.8349997997284,
115.614000082016,
10.2449998855591,
171.426000118256,
7.58200001716614,
87.805999994278,
23.1909999847412,
390.417999982834,
35.8229999542236,
4.73100018501282,
29.0869998931885,
0,
36.789999961853,
73.8420000076294,
12.8770000934601,
123.771999835968,
10.7190001010895
)
),
row.names = c(NA,-20L),
class = c("tbl_df", "tbl", "data.frame")
)
We can create the desired column using cumsum as below
df %>%
mutate(grp = cumsum(duration == 0))
# A tibble: 20 x 2
# duration grp
# <dbl> <int>
# 1 0 1
# 2 40.0 1
# 3 247. 1
# 4 11.8 1
# 5 116. 1
# 6 10.2 1
# 7 171. 1
# 8 7.58 1
# 9 87.8 1
#10 23.2 1
#11 390. 1
#12 35.8 1
#13 4.73 1
#14 29.1 1
#15 0 2
#16 36.8 2
#17 73.8 2
#18 12.9 2
#19 124. 2
#20 10.7 2

Run function in R for mean

Based on my values i need a function to get following results.
enter image description here
The functions has to calculate the mean of current value and the 3 previous values.
The function should be flexible in that way, that the same calculation can be applied for 2, 4, 5 or x previous values, for example: mean of current value and the 2 previous values.
please consider, that my daten has random numbers, and not like in above example ascending numbers
What you need is a rolling mean, in the argument k (4 in my example) you provide an integer width of the rolling window. Check the documentation page for the rollmean function of the zoo package, ?rollmean.
zoo
library(zoo)
library(dplyr)
df <- data.frame(number = 1:20)
df %>% mutate(rolling_avg = rollmean(number, k = 4 , fill = NA, align = "right"))
RcppRoll
library(RcppRoll)
df %>% mutate(rolling_avg = roll_mean(number, n = 4, fill = NA, align = "right"))
Output
number rolling_avg
1 1 NA
2 2 NA
3 3 NA
4 4 2.5
5 5 3.5
6 6 4.5
7 7 5.5
8 8 6.5
9 9 7.5
10 10 8.5
11 11 9.5
12 12 10.5
13 13 11.5
14 14 12.5
15 15 13.5
16 16 14.5
17 17 15.5
18 18 16.5
19 19 17.5
20 20 18.5
Using the other vector you provided in the comments:
df <- data.frame(number = c(1,-3,5,4,3,2,-4,5,6,-4,3,2,3,-4,5,6,6,3,2))
df %>% mutate(rolling_avg = rollmean(number, 4, fill = NA, align = "right"))
Output
number rolling_avg
1 1 NA
2 -3 NA
3 5 NA
4 4 1.75
5 3 2.25
6 2 3.50
7 -4 1.25
8 5 1.50
9 6 2.25
10 -4 0.75
11 3 2.50
12 2 1.75
13 3 1.00
14 -4 1.00
15 5 1.50
16 6 2.50
17 6 3.25
18 3 5.00
19 2 4.25
You can also use the rollify function in the tibbletime package to create a custom rolling function for any function. For mean it would look like this (using data from #mpalanco's answer):
library(dplyr)
library(tibbletime)
rolling_mean <- rollify(mean, window = 4)
df %>% mutate(moving_average = rolling_mean(number))
which gives you:
number moving_average
1 1 NA
2 2 NA
3 3 NA
4 4 2.5
5 5 3.5
6 6 4.5
7 7 5.5
8 8 6.5
9 9 7.5
10 10 8.5
11 11 9.5
12 12 10.5
13 13 11.5
14 14 12.5
15 15 13.5
16 16 14.5
17 17 15.5
18 18 16.5
19 19 17.5
20 20 18.5
The benefit of this approach is that it is easy to extend to things other than rolling average.

Calculating cumulative mean using floating conditions

My dataset has as features: players IDs, team, weeks and points.
I want to calculate the mean of TEAM points for previous weeks, but not all past weeks, just to the last 5 or less (if the current week is smaller than 5).
Example: For team = A, week = 7, the result will be the average of POINTS for team = A and weeks 2, 3, 4, 5 and 6.
The dataset can be created using the following code:
# set the seed for reproducibility
set.seed(123)
player_id<-c(rep(1,15),rep(2,15),rep(3,15),rep(4,15))
week<-1:15
team<-c(rep("A",30),rep("B",30))
points<-round(runif(60,1,10),0)
mydata<- data.frame(player_id=player_id,team=team,week=rep(week,4),points)
I would like to have a solution without a heavy looping, because the dataset is huge.
I have done related questions here that maybe will help, but I could not adapt to this case.
Question 1
Question 2
Thank you!
We adapt the approach from my answer to one of your other questions if you want a dplyr solution:
library(dplyr)
library(zoo)
# set the seed for reproducibility
set.seed(123)
player_id<-c(rep(1,15),rep(2,15),rep(3,15),rep(4,15))
week<-1:15
team<-c(rep("A",30),rep("B",30))
points<-round(runif(60,1,10),0)
mydata<- data.frame(player_id=player_id,team=team,week=rep(week,4),points)
roll_mean <- function(x, k) {
result <- rollapplyr(x, k, mean, partial=TRUE, na.rm=TRUE)
result[is.nan(result)] <- NA
return( result )
}
It might first be easier to aggregate by team:
team_data <- mydata %>%
select(-player_id) %>%
group_by(team, week) %>%
arrange(week) %>%
summarise(team_points = sum(points)) %>%
mutate(rolling_team_mean = roll_mean(lag(team_points), k=5)) %>%
arrange(team)
team_data
# A tibble: 30 x 4
# Groups: team [2]
team week team_points rolling_team_mean
<fctr> <int> <dbl> <dbl>
1 A 1 13 NA
2 A 2 11 13.00
3 A 3 6 12.00
4 A 4 13 10.00
5 A 5 19 10.75
6 A 6 10 12.40
7 A 7 13 11.80
8 A 8 16 12.20
9 A 9 16 14.20
10 A 10 12 14.80
# ... with 20 more rows
Then, if you like we can put everything back together:
mydata <- inner_join(mydata, team_data) %>%
arrange(week, team, player_id)
mydata[1:12, ]
player_id team week points team_points rolling_team_mean
1 1 A 1 4 13 NA
2 2 A 1 9 13 NA
3 3 B 1 10 12 NA
4 4 B 1 2 12 NA
5 1 A 2 8 11 13
6 2 A 2 3 11 13
7 3 B 2 9 12 12
8 4 B 2 3 12 12
9 1 A 3 5 6 12
10 2 A 3 1 6 12
11 3 B 3 7 12 12
12 4 B 3 5 12 12
Here's one way:
# compute points per team per week
pts <- with(mydata, tapply(points, list(team, week), sum, default = 0))
pts
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
#A 13 11 6 13 19 10 13 16 16 12 17 11 13 10 4
#B 12 12 12 11 10 6 13 11 6 9 5 7 13 13 6
# compute the 5-week averages
sapply(setNames(seq(2, ncol(pts)), seq(2, ncol(pts))),
function(i) {
apply(pts[, seq(max(1, i - 5), i - 1), drop = FALSE], 1, mean)
})
# 2 3 4 5 6 7 8 9 10 11 12 13 14 15
#A 13 12 10 10.75 12.4 11.8 12.2 14.2 14.8 13.4 14.8 14.4 13.8 12.6
#B 12 12 12 11.75 11.4 10.2 10.4 10.2 9.2 9.0 8.8 7.6 8.0 9.4
This will give the wrong result if the week variable has gaps.

Resources