When creating and plotting a table, the names are numeric values, and I would like them to stay in numeric order.
Code :
library(plyr)
set.seed(1234)
# Create a random vector drawn from 11 lettered categories with the
# given per-category probabilities.
number_of_categories <- 11
probability_of_each_category <- c(0.1, 0.05, 0.05, 0.08, 0.01,
                                  0.1, 0.2, 0.3, 0.01, 0.02, 0.08)
number_of_samples <- 1000
x <- sample(LETTERS[1:number_of_categories],
            number_of_samples,
            replace = TRUE,
            prob = probability_of_each_category)
# Just a vector of zeros and ones.
outcome <- rbinom(number_of_samples, 1, 0.4)
# Relabel the letters as "1".."11" to demonstrate the issue: table()
# sorts these labels as character strings ("1", "10", "11", "2", ...).
# seq_len(n) is the safe, idiomatic way to spell 1..n; seq(1:n) only
# works by accident.
x <- mapvalues(x,
               c(LETTERS[1:number_of_categories]),
               seq_len(number_of_categories))
# The table shows the (lexicographic) ordering.
prop.table(table(x))
plot(table(x, outcome))
Table :
> prop.table(table(x))
x
1 10 11 2 3 4 5 6 7 8 9
0.105 0.023 0.078 0.044 0.069 0.083 0.018 0.097 0.195 0.281 0.007
Plot :
I would like the plot and the table in the order
1 3 4 5 ... 10 11
Rather than
1 10 11 2 3 4 5 6 7 8 9
You can either convert x to numeric before feeding it to table
# Coercing the labels to numeric makes table() sort them numerically
# instead of lexicographically.
plot(table(as.numeric(x), outcome))
Or order the table's rows by the as.numeric of the rownames
# Build the table once, then reorder its rows by the numeric value of
# the row names. Named `tab` rather than `t`, which shadows base::t().
tab <- table(x, outcome)
tab <- tab[order(as.numeric(rownames(tab))), ]
plot(tab)
A simple way to solve this problem is to format the numbers to include a leading zero during mapvalues(), using sprintf().
# Zero-pad the labels ("01".."11") so that lexicographic order matches
# numeric order; seq_len(n) replaces the seq(1:n) anti-pattern.
x <- mapvalues(x,
               c(LETTERS[1:number_of_categories]),
               sprintf("%02d", seq_len(number_of_categories)))
# the table now shows the desired ordering
prop.table(table(x))
plot(table(x, outcome))
...and the output:
> prop.table(table(x))
x
01 02 03 04 05 06 07 08 09 10 11
0.104 0.067 0.038 0.073 0.019 0.112 0.191 0.291 0.011 0.019 0.075
I am trying to write code that will do autocorrelation for multiple subsets. For example, I have health data for multiple countries over time. I want to get each country's autocorrelation for each variable. Any help would be great!
Here are some things I have tried, unsuccessfully:
# NOTE(review): these are the asker's unsuccessful attempts, kept as-is.
# `require()` is used for loading here; `library()` is preferred because
# it errors instead of silently returning FALSE.
require(plyr)
# This computes the ACF of the whole POP column at once, not per country.
POP_ACF=acf(PhD_data_list_view$POP, lag.max=NULL, type=c("correlation"),
plot=TRUE, na.action=na.pass, demean=TRUE)
# %+% cannot re-fit an already-computed acf object to new data.
dlply(PhD_data_list_view, .(Country), function(x) POP_ACF %+% x)
# Invalid syntax: `PhD_data_list_view$POP` cannot be a formal argument name.
POP_ACF=function(PhD_data_list_view$POP) c(acf(PhD_data_list_view$POP, plot=TRUE)$acf)
acf is a function that takes a vector and returns a list. That makes it a natural fit for the purrr package, which maps functions over lists, but it can also be done using base R.
I'll use the beaver1 dataset from the datasets package since you didn't provide yours. I'll use different days of observations as the analogue to your different countries, and temperature for your POP variable.
Base R:
split turns the vector beaver1$temp into a list of vectors along the second argument, beaver1$day.
Then mapply runs the function acf on each element of that list.
Since we're using mapply instead of lapply, we can also provide another list of arguments, here the titles for each plot, main = unique(beaver1$day).
The last argument, SIMPLIFY = F, tells it to return the default output, not attempt to coerce the list into anything else.
# One ACF plot per day: split() breaks temp into a list of per-day
# vectors, and mapply() runs acf() on each element, pairing every call
# with its plot title. SIMPLIFY = FALSE keeps the plain list output
# (always spell out FALSE; the shorthand F can be reassigned).
par(mfrow = c(1, 2))
mapply(acf,
       split(beaver1$temp, beaver1$day),
       main = unique(beaver1$day),
       SIMPLIFY = FALSE)
# $`346`
#
# Autocorrelations of series ‘dots[[1L]][[1L]]’, by lag
#
# 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
# 1.000 0.838 0.698 0.593 0.468 0.355 0.265 0.167 0.113 0.069 0.028 0.037 0.087 0.108 0.145 0.177 0.151 0.125 0.123 0.106
# $`347`
#
# Autocorrelations of series ‘dots[[1L]][[2L]]’, by lag
#
# 0 1 2 3 4 5 6 7 8 9 10 11 12 13
# 1.000 0.546 0.335 0.130 0.080 0.024 -0.025 -0.103 -0.090 -0.032 0.168 0.036 -0.089 -0.306
purrr and the tidy way:
This way is a bit more flexible depending what you want to do with the output. We can use purrr::map as a direct drop-in for mapply:
library(purrr)
# Split the data frame into one piece per day, then map acf() over the
# resulting list, titling each plot with its day.
beaver1 %>%
  split(.$day) %>%
  map(function(day_df) acf(day_df$temp, main = unique(day_df$day)))
Which returns the exact same output. But we can also go fully tidy and return the data from acf as a dataframe so that we can explore it further with ggplot2.
The first map is returning a list of outputs, each of which is a list containing, among other things, variables lag, acf, and n.used.
The map_dfr is running the function data.frame, assigning each of those variables to a new column.
We also make a column to calculate the CIs. Refer to: How is the confidence interval calculated for the ACF function?
Then we can use ggplot to make any kind of plot we want, and we still have the data for any other analysis you want to do.
library(ggplot2)
# Compute the ACF per day (plot = FALSE), then flatten the list of acf
# objects into one tidy data frame — one row per day/lag — adding a 95%
# confidence-interval column, qnorm(0.975)/sqrt(n.used).
beaver_acf <-
  beaver1 %>%
  split(.$day) %>%
  map(~acf(.$temp, plot = FALSE)) %>%
  map_dfr(
    ~data.frame(lag = .$lag,
                acf = .$acf,
                ci = qnorm(0.975)/sqrt(.$n.used)
    ), .id = "day")
head(beaver_acf)
#   day lag       acf        ci
# 1 346   0 1.0000000 0.2054601
# 2 346   1 0.8378889 0.2054601
# 3 346   2 0.6983476 0.2054601
# 4 346   3 0.5928198 0.2054601
# 5 346   4 0.4680912 0.2054601
# 6 346   5 0.3554939 0.2054601
# Lollipop-style ACF plot with dashed CI lines, one panel per day.
ggplot(beaver_acf, aes(lag, acf)) +
  geom_segment(aes(xend = lag, yend = 0)) +
  geom_hline(aes(yintercept = ci), linetype = "dashed", color = "blue") +
  geom_hline(aes(yintercept = -ci), linetype = "dashed", color = "blue") +
  # facet on `day`: the data frame has no `variable` column, so the
  # original facet_wrap(~variable) would fail to find its faceting var
  facet_wrap(~day)
I think this is a split-apply-combine problem, but with a time series twist. My data consists of irregular counts and I need to perform some summary statistics on each group of counts. Here's a snapshot of the data:
And here's it is for your console:
library(xts)
# Observation dates: two irregular runs of weekly counts with NA gaps.
date <- as.Date(c("2010-11-18", "2010-11-19", "2010-11-26", "2010-12-03", "2010-12-10",
"2010-12-17", "2010-12-24", "2010-12-31", "2011-01-07", "2011-01-14",
"2011-01-21", "2011-01-28", "2011-02-04", "2011-02-11", "2011-02-18",
"2011-02-25", "2011-03-04", "2011-03-11", "2011-03-18", "2011-03-25",
"2011-03-26", "2011-03-27"))
# Observed returns for every date.
returns <- c(0.002,0.000,-0.009,0.030, 0.013,0.003,0.010,0.001,0.011,0.017,
-0.008,-0.005,0.027,0.014,0.010,-0.017,0.001,-0.013,0.027,-0.019,
0.000,0.001)
# Irregular count series; NA marks dates outside any count run.
count <- c(NA,NA,1,1,2,2,3,4,5,6,7,7,7,7,7,NA,NA,NA,1,2,NA,NA)
# Desired per-run maximum of returns (the target output to reproduce).
maxCount <- c(NA,NA,0.030,0.030,0.030,0.030,0.030,0.030,0.030,0.030,0.030,
0.030,0.030,0.030,0.030,NA,NA,NA,0.027,0.027,NA,NA)
# Desired per-run cumulative sum of returns (the target output).
sumCount <- c(NA,NA,0.000,0.030,0.042,0.045,0.056,0.056,0.067,0.084,0.077,
0.071,0.098,0.112,0.123,NA,NA,NA,0.000,-0.019,NA,NA)
xtsData <- xts(cbind(returns,count,maxCount,sumCount),date)
I have no idea how to construct the max and cumSum columns, especially since each count series is of an irregular length. Since I won't always know the start and end points of a count series, I'm lost at trying to figure out the index of these groups. Thanks for your help!
UPDATE: here is my for loop attempting to calculate cumSum. It's not the cumulative sum, just the returns necessary; I'm still unsure how to apply functions to these ranges!
# Add a column that holds returns where count is non-NA and NA
# elsewhere. Vectorized logical subsetting replaces the original
# row-by-row loop (and its `is.na(...) == FALSE` comparison); the new
# column starts as NA, so only the non-NA-count rows need assignment.
xtsData <- cbind(xtsData,mySumCount=NA)
has_count <- !is.na(xtsData[, "count"])
xtsData[has_count, "mySumCount"] <- xtsData[has_count, "returns"]
UPDATE 2: thank you commenters!
# report returns only on rows where count is not NA
x1 <- xtsData[!is.na(xtsData$count),"returns"]
# close, but this cumsum runs across group boundaries, and the first
# element of each count series (-0.009 and 0.027) still needs excluding
x2 <- cumsum(xtsData[!is.na(xtsData$count),"returns"])
# not a rounding error: max() collapses everything to a single scalar
# (0.03), which is then recycled down the whole column; a per-group
# max is what's actually needed
x3 <- max(xtsData[!is.na(xtsData$count),"returns"])
SOLUTION:
# Shift a vector right by k positions, padding the front with k zeros
# and preserving the original length. seq_len() makes the empty-vector
# case safe: 1:length(x) would be c(1, 0) when x is empty, returning a
# spurious length-1 result instead of an empty one.
lagpad <- function(x, k) {
  c(rep(0, k), x)[seq_len(length(x))]
}
# group the counts: g numbers each run of consecutive non-NA counts
# (1, 2, ...); na.omit then drops the NA rows
x1 <- na.omit(transform(xtsData, g = cumsum(c(0, diff(!is.na(count)) == 1))))
# cumulative sum of returns within each count series, with the first
# return of each series replaced by 0 before summing
z1 <- transform(x1, cumsumRet = ave(returns, g, FUN =function(x) cumsum(replace(x, 1, 0))))
# max of each count series, computed on the lagged (lagpad) returns so
# each series' first return is excluded from its max
z2 <- transform(x1, maxRet = ave(returns, g, FUN =function(x) max(lagpad(x,1))))
# NOTE(review): transform() on an xts object coerces it; confirm that
# merge() recombines these columns with the intended alignment
merge(xtsData,z1$cumsumRet,z2$maxRet)
The code shown is not consistent with the output in the image, and there is no explanation provided, so it's not clear what manipulations were wanted; however, the question did mention that the main problem is distinguishing the groups, so we will address that.
To do that we compute a new column g whose rows contain 1 for the first group, 2 for the second and so on. We also remove the NA rows since the g column is sufficient to distinguish groups.
The following code computes a vector the same length as count by first setting each NA position to FALSE and each non-NA position to TRUE. It then differences each position of that vector with the prior position. To do that it implicitly converts FALSE to 0 and TRUE to 1 and then performs the differencing. Next we convert this last result to a logical vector which is TRUE for each 1 component and FALSE otherwise. Since the first component of the vector that is differenced has no prior position we prepend 0 for that. The prepending operation implicitly converts the TRUE and FALSE values just generated to 1 and 0 respectively. Taking the cumsum fills in the first group with 1, the second with 2 and so on. Finally omit the NA rows:
# g counts the FALSE-to-TRUE transitions of !is.na(count), i.e. it holds
# the run number of each non-NA stretch; na.omit then drops the NA rows
x <- na.omit(transform(x, g = cumsum(c(0, diff(!is.na(count)) == 1))))
giving:
> x
returns count maxCount sumCount g
2010-11-26 -0.009 1 0.030 0.000 1
2010-12-03 0.030 1 0.030 0.030 1
2010-12-10 0.013 2 0.030 0.042 1
2010-12-17 0.003 2 0.030 0.045 1
2010-12-24 0.010 3 0.030 0.056 1
2010-12-31 0.001 4 0.030 0.056 1
2011-01-07 0.011 5 0.030 0.067 1
2011-01-14 0.017 6 0.030 0.084 1
2011-01-21 -0.008 7 0.030 0.077 1
2011-01-28 -0.005 7 0.030 0.071 1
2011-02-04 0.027 7 0.030 0.098 1
2011-02-11 0.014 7 0.030 0.112 1
2011-02-18 0.010 7 0.030 0.123 1
2011-03-18 0.027 1 0.027 0.000 2
2011-03-25 -0.019 2 0.027 -0.019 2
attr(,"na.action")
2010-11-18 2010-11-19 2011-02-25 2011-03-04 2011-03-11 2011-03-26 2011-03-27
1 2 16 17 18 21 22
attr(,"class")
[1] "omit"
You can now use ave to perform any calculations you like. For example to take cumulative sums of returns by group:
# per-group cumulative sum: ave() applies cumsum within each level of g
transform(x, cumsumRet = ave(returns, g, FUN = cumsum))
Replace cumsum with any other function that is suitable for use with ave.
Ah, so "count" are the groups and you want the cumsum per group and the max per group. I think in data.table, so here is how I would do it.
library(xts)
library(data.table)
# Rebuild the example data from the question.
date <- as.Date(c("2010-11-18", "2010-11-19", "2010-11-26", "2010-12-03", "2010-12-10",
                  "2010-12-17", "2010-12-24", "2010-12-31", "2011-01-07", "2011-01-14",
                  "2011-01-21", "2011-01-28", "2011-02-04", "2011-02-11", "2011-02-18",
                  "2011-02-25", "2011-03-04", "2011-03-11", "2011-03-18", "2011-03-25",
                  "2011-03-26", "2011-03-27"))
returns <- c(0.002,0.000,-0.009,0.030, 0.013,0.003,0.010,0.001,0.011,0.017,
             -0.008,-0.005,0.027,0.014,0.010,-0.017,0.001,-0.013,0.027,-0.019,
             0.000,0.001)
count <- c(NA,NA,1,1,2,2,3,4,5,6,7,7,7,7,7,NA,NA,NA,1,2,NA,NA)
# NOTE(review): maxCount/sumCount are the question's expected outputs;
# they are not used in the computation below.
maxCount <- c(NA,NA,0.030,0.030,0.030,0.030,0.030,0.030,0.030,0.030,0.030,
              0.030,0.030,0.030,0.030,NA,NA,NA,0.027,0.027,NA,NA)
sumCount <- c(NA,NA,0.000,0.030,0.042,0.045,0.056,0.056,0.067,0.084,0.077,
              0.071,0.098,0.112,0.123,NA,NA,NA,0.000,-0.019,NA,NA)
# (fixed: the original line ended with a stray "]", a syntax error)
DT <- data.table(date, returns, count)
# Per-group max and cumulative sum of returns, added by reference.
# NOTE(review): by = count groups by the *value* of count, so rows with
# count == 1 from both series fall into one group — confirm this matches
# the intended run-based grouping.
DT[!is.na(count), max := max(returns), by = count]
DT[!is.na(count), cumSum := cumsum(returns), by = count]
# if you need an xts object at the end, then:
xtsData <- xts(cbind(DT$returns, DT$count, DT$max, DT$cumSum), DT$date)