ggvis barchart using dates as x axis - r

I have been switching over from ggplot to ggvis when working with shiny apps. I have figured out a lot but am really stumped when it comes to bar graphs. I have a timeseries with dates and values. I simply want bars instead of points for each value (I would ideally like to be able to plot multiple semi-transparent bars if anyone has had success there please share) but here I wanted to get one bar working.
Works with layer_points()
df %>% ggvis(~date, ~x) %>% layer_points() %>% scale_datetime("x")
Doesnt work with layer_bars()
df %>% ggvis(~date, ~x) %>% layer_bars() %>% scale_datetime("x")
Data I am using...
structure(list(date = structure(c(7680, 7687, 7694, 7701, 7708,
7715, 7722, 7729, 7736, 7743, 7750, 7757, 7764, 7771, 7778, 7785,
7792, 7799, 7806, 7813, 7820, 7827, 7834, 7841, 7848, 7855, 7862,
7869, 7876, 7883, 7890, 7897, 7904, 7911, 7918, 7925, 7932, 7939,
7946, 7953, 7960, 7967, 7974, 7981, 7988, 7995, 8002, 8009, 8016,
8023, 8030, 8037, 8044, 8051, 8058, 8065, 8072, 8079, 8086, 8093,
8100, 8107, 8114, 8121, 8128, 8135, 8142, 8149, 8156, 8163, 8170,
8177, 8184, 8191, 8198, 8205, 8212, 8219, 8226, 8233, 8240, 8247,
8254, 8261, 8268, 8275, 8282, 8289, 8296, 8303, 8310, 8317, 8324,
8331, 8338, 8345, 8352, 8359, 8366, 8373, 8380, 8387, 8394, 8401,
8408, 8415, 8422, 8429, 8436, 8443, 8450, 8457, 8464, 8471, 8478,
8485, 8492, 8499, 8506, 8513, 8520, 8527, 8534, 8541, 8548, 8555,
8562, 8569, 8576, 8583, 8590, 8597, 8604, 8611, 8618, 8625, 8632,
8639, 8646, 8653, 8660, 8667, 8674, 8681, 8688, 8695, 8702, 8709,
8716, 8723, 8730, 8737, 8744, 8751, 8758, 8765, 8772, 8779, 8786,
8793, 8800, 8807, 8814, 8821, 8828, 8835, 8842, 8849, 8856, 8863,
8870, 8877, 8884, 8891, 8898, 8905, 8912, 8919, 8926, 8933, 8940,
8947, 8954, 8961, 8968, 8975, 8982, 8989, 8996, 9003, 9010, 9017,
9024, 9031, 9038, 9045, 9052, 9059, 9066, 9073), class = "Date"),
x = c(-0.034038302, 0.122310949, -0.002797319, 0.026515253,
0.039961798, 0.034473263, 0.00549937, -0.024125944, 0.000132490000000001,
0.011038357, -0.02135072, 0.030663311, -0.008915551, 0.004855042,
0.01563688, -0.007397493, 0.013569146, -0.004968811, -0.00250391,
0.014624532, 0.036937453, -0.023685917, 0.018921356, -0.003066779,
-0.009217771, 0.005317513, 0.010378968, 0.001580798, -0.015085972,
-0.000121644000000001, 0.020468644, 0.007925229, 0.007721276,
-0.003123545, -0.018317891, -0.014900591, 0.003260844, -0.001565358,
-0.014833886, 0.00366766, 0.014297139, -0.00725552, 0.012207931,
0.024035152, -0.024195095, -0.0043564, 0.000847468, 0.033031596,
0.023685033, 0.025143071, 0.046264348, 0.038285177, -0.009180356,
-0.01630399, -0.010131294, -0.009939386, -0.007620427, 0.013062259,
0.009912238, 0.000192973, -0.01683559, -0.002627549, 0.019836063,
-0.019946159, -0.020124331, 0.012921737, 0.034604405, -0.020774015,
0.00334805, 0.002271156, -0.018676732, 0.019160923, -0.01945997,
-0.014342636, -0.004867796, -0.010002446, -0.004372991, 0.023164369,
0.019824112, -0.00321832, -0.015785746, 0.040836652, 0.00148831,
0.012084485, -0.009603897, -0.004642148, -0.008399234, 0.010463218,
0.000256571000000001, -0.01978405, -0.003439498, -0.015669975,
0.026180724, 0.020373255, 0.019160773, 0.00692683, 0.010215506,
0.010861939, 0.012041143, 0.025734568, -0.004828156, 0.006914552,
-0.00720089, -0.000538489999999999, -0.008479448, 0.022926604,
0.002131842, -0.003688597, 0.025325639, -0.009562293, -0.024336741,
0.012907537, 0.004339383, 0.010744364, -0.013058765, -0.003672014,
-0.023887493, 0.01062259, 0.02088054, -0.035249878, -0.001462821,
0.01904368, -0.001308787, 0.009203217, 0.019856479, 0.011296979,
0.010039545, -0.01559142, 0.006083419, -0.017958978, -0.007488063,
0.01236649, -0.004459064, -0.004375386, 0.025500722, 0.005557851,
0.008444321, 0.002827649, 0.020320308, 0.031611803, -0.010199803,
-0.009425874, 0.007942729, -2.59379999999999e-05, 0.016669077,
-0.011666062, 0.022835386, -0.025599107, 0.013562535, -0.018365192,
0.018148786, 0.016649144, -0.009530455, 0.012996597, 0.002034778,
-0.005926478, -0.004897238, -0.004419719, 0.010848926, -0.006039757,
-0.030287605, 0.019221837, 0.001808161, -0.009566133, 0.005009292,
0.005365023, -0.004879922, -0.024637933, -0.0186584, 0.004786059,
-0.008245254, -0.000106243, -0.001714888, -0.017804006, -0.021200061,
0.003812757, 0.021940886, 0.002270448, -0.015417493, -0.045754612,
-0.003468442, -0.006242659, 0.022383824, -0.018753927, 0.008577571,
0.008655048, 0.02374636, 0.029522811, 0.009946946, 0.015419714,
-0.016714623, -0.014616188, 0.019670855, -0.038979063, 0.020491563,
-0.009640674, 0.046051144, -0.021434575, 0.000190443999999998,
-0.029013969), id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124,
125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136,
137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148,
149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172,
173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184,
185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196,
197, 198, 199, 200)), .Names = c("date", "x", "id"), row.names = 53:252, class = "data.frame")

Set format df$date as character:
df$date <- as.character(df$date)
and then:
df %>% ggvis(~date, ~x) %>% layer_bars()

Related

How to identify parameters for SARIMA model in R

Part 2 Boston
plot(boston, ylab=" Boston crime data", xlab= "Time")
#Time series seem to have homogeneous variance upon visual inspection
#Q2
#Trend looks linear in the plot, so for trend differencing operator take d=1
newboston= as.numeric(unlist(boston))
xdiff = diff(newboston)
plot(xdiff)
#Q3
#ADF
library(tseries)
adf.test(xdiff)
#From the result, alternative hypothesis is stationary so null hypothesis is rejected
#KPSS test
install.packages('fpp3', dependencies = TRUE)
library ( fpp3 )
unitroot_kpss(xdiff)
#the p-value is >0.05, so fail to reject null hypothesis for KPSS
#Q4
library(astsa)
acf2(xdiff, max.lag = 50)
model1 = sarima(xdiff, p, 1, q)
So this is what I have tried so far. I am quite new to R and so do be kind if my workings make little sense. For context, Boston is the data I imported from an excel, that is simply a column of x axis data.
Firstly, I am trying to do Q4, but I am not sure how I would go about to find p and q.
Second, I am unsure whether what I did in Q2 to detrend my data is correct in the first place.
Here is the output of dput(boston)
dput(boston)
structure(list(x = c(41, 39, 50, 40, 43, 38, 44, 35, 39, 35,
29, 49, 50, 59, 63, 32, 39, 47, 53, 60, 57, 52, 70, 90, 74, 62,
55, 84, 94, 70, 108, 139, 120, 97, 126, 149, 158, 124, 140, 109,
114, 77, 120, 133, 110, 92, 97, 78, 99, 107, 112, 90, 98, 125,
155, 190, 236, 189, 174, 178, 136, 161, 171, 149, 184, 155, 276,
224, 213, 279, 268, 287, 238, 213, 257, 293, 212, 246, 353, 339,
308, 247, 257, 322, 298, 273, 312, 249, 286, 279, 309, 401, 309,
328, 353, 354, 327, 324, 285, 243, 241, 287, 355, 460, 364, 487,
452, 391, 500, 451, 375, 372, 302, 316, 398, 394, 431, 431),
y = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
117, 118)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-118L))

FCT_Collapse using a range

Im trying to use a range (160:280) instead of '160', '161' and so on. How would i do that?
group_by(disp = fct_collapse(as.character(disp), Group1 = c(160:280), Group2 = c(281:400)) %>%
summarise(meanHP = mean(hp)))
Error: Problem adding computed columns in `group_by()`.
x Problem with `mutate()` column `disp`.
i `disp = `%>%`(...)`.
x Each input to fct_recode must be a single named string. Problems at positions: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 17```
For range of values it is better to use cut where you can define breaks and labels.
library(dplyr)
library(forcats)
mtcars %>%
group_by(disp = cut(disp, c(0, 160, 280, 400, Inf), paste0('Group', 1:4))) %>%
summarise(meanHP = mean(hp))
# disp meanHP
# <fct> <dbl>
#1 Group1 93.1
#2 Group2 143
#3 Group3 217.
#4 Group4 217.
So here 0-160 becomes 'Group1', 160-280 'Group2' and so on.
With fct_collapse you can do -
mtcars %>%
group_by(disp = fct_collapse(as.character(disp), Group1 = as.character(160:280), Group2 = as.character(281:400))) %>%
summarise(meanHP = mean(hp)) %>%
suppressWarnings()
However, this works only for exact values which are present so 160 would be in group1 but not 160.1.
We could also do
library(dplyr)
library(stringr)
mtcars %>%
group_by(disp = cut(disp, c(0, 160, 280, 400, Inf), strc('Group', 1:4))) %>%
summarise(meanHP = mean(hp))

In R, how can I properly subset a data frame based on a list of values inside of a function?

I have a function that is attempting to select rows from a dataframe based on a list of values.
For instance, some values might be:
> subset_ids
[1] "JUL_0003_rep1" "JUL_0003_rep2"
[3] "JUL_0003_rep3" "JUL_0007_rep1"
[5] "JUL_0007_rep2" "JUL_0007_rep3"
I have a data frame called "targets" with a column called "LongName". It has many other columns but no big deal. I want to select the rows from targets when LongName is in subset ids.
I can do this fine with either:
targets[is.element(targets$LongName, subset_ids),]
or
targets[targets$LongName %in% subset_ids,]
The problem is that I want to do this in a function, and I don't know what the column will be called in advance.
So I tried using the eval/parse method, which upon recent reading may not be the best way to do it. When I do the following:
sub1 <- paste("targets[is.element(targets$", column_name, ", subset_ids),]", sep="")
targets_subset <- as.character(eval(parse(text = sub1)))
It returns some strange concatenation of row numbers. It looks like this:
[1] "c(5, 6, 7, 17, 18, 19, 26, 27, 28, 35, 36, 46, 47, 48, 54, 55, 61, 62, 63, 64, 73, 74, 75, 76, 77, 78, 91, 92, 93, 102, 103, 104, 114, 117, 118, 129, 136, 137, 140, 141, 151, 152, 153, 157, 158, 159, 169, 172, 173, 183, 187, 188, 199, 200, 201, 208, 209, 210, 232, 233, 241, 242, 243, 252, 253, 254, 264, 265, 270, 271, 285, 286, 296, 297, 298)"
[2] "c(5, 6, 7, 17, 18, 19, 26, 27, 28, 35, 36, 46, 47, 48, 54, 55, 61, 62, 63, 64, 73, 74, 75, 76, 77, 78, 91, 92, 93, 102, 103, 104, 114, 117, 118, 129, 136, 137, 140, 141, 151, 152, 153, 157, 158, 159, 169, 172, 173, 183, 187, 188, 199, 200, 201, 208, 209, 210, 232, 233, 241, 242, 243, 252, 253, 254, 264, 265, 270, 271, 285, 286, 296, 297, 298)"
[3] "c(3, 3, 3, 7, 7, 7, 11, 11, 11, 15, 15, 19, 19, 19, 22, 22, 26, 26, 27, 27, 31, 31, 31, 32, 32, 32, 39, 39, 39, 43, 43, 43, 47, 49, 49, 53, 57, 57, 59, 59, 63, 63, 63, 65, 65, 65, 70, 72, 72, 76, 78, 78, 83, 83, 83, 86, 86, 86, 97, 97, 100, 100, 100, 104, 104, 104, 108, 108, 111, 111, 117, 117, 121, 121, 121)"
So 5, 6, 7, 17 ... appear to be the right rows for the target i'm trying to pick, but I don't understand why it sent this back in the first place, or what item [3] is at all.
If I manually execute the line generated by the above "sub1 <- ...", then it returns the proper data. If I ask the function to do it, it returns this garbage.
My question is two-fold. 1: Why is the data being returned this way? 2: Is there a better way than eval/parse to do what I'm trying to do?
I suspect some strange scope or environment level issue, but it is unclear to me at this point. I appreciate any advice anyone has.
The data are returned that way because you are coercing the dataframe to a character object. Try
as.character(head(targets))
to see a short example.
So, your method works if you eliminate the as.character(). Here it is as a MWE:
targets <- data.frame(LongName = sample(letters, 1000, replace = TRUE),
SeqNum= 1:1000,
X = rnorm(1000))
subset_ids <- c("a","f")
targets[is.element(targets$LongName, subset_ids),]
targets[targets$LongName %in% subset_ids,]
testfun <- function(targets, column_name, subset_ids){
sub1 <- paste("targets[is.element(targets$", column_name, ", subset_ids),]", sep="")
targets_subset <- eval(parse(text = sub1))
return(targets_subset)
}
testfun(targets, column_name = "LongName", subset_ids)

Reverse leaf order in dendrogram using R

I have tried for several days to just flip a dendrogram so that the last gene is the first in the figure and the first the last. But even when I have managed to move leaves around the internal ordering is not the same. Here is my script:
cluster.hosts <- read.table("Norm_0_to1_heatmap.txt", header = TRUE, sep="", quote="/", row.names = 1)
# A table with 8 columnns and 229 rows cirresponding to gene expression
hosts.dist <- dist(cluster.hosts, method = "euclidean", diag = FALSE, upper = FALSE, p = 2)
hc <- hclust(hosts.dist, method = "average")
dd <- as.dendrogram(hc)
order.dendrogram(dd)
X11()
par(cex=0.5,font=3)
plot(dd, main="Dendrogram of Syn9 genes")
order.dd <- order.dendrogram(dd) #the numbers in the order indicate the position of the gene in the original table
#Then I generate a vector with the opposed order to the one obtained
y <- c(206, 204, 210, 209, 213, 212, 211, 207, 208, 94, 199, 192, 195, 198, 193, 201, 203, 200, 185, 61, 191, 190, 197, 189, 188, 196, 187, 215, 214, 202, 217, 220, 219, 218, 95, 180, 179, 181, 182, 186, 178, 132, 133, 122, 66, 65, 64, 58, 91, 88, 92, 89, 62, 184, 103, 128, 127, 229, 231, 230, 148, 63, 228, 116, 134, 104, 221, 78, 20, 232, 160, 159, 225, 112, 167, 164, 166, 140, 222, 51, 149, 227, 79, 68, 90, 131, 130, 136, 135, 105, 147, 172, 150, 176, 175, 174, 177, 152, 151, 165, 137, 168, 163, 52, 146, 141, 145, 82, 81, 56, 161, 120, 144, 129, 84, 1, 173, 143, 142, 86, 85, 83, 194, 183, 111, 55, 53, 54, 224, 171, 170, 223, 169, 93, 59, 60, 123, 121, 124, 87, 125, 226, 3, 158, 47, 10, 162, 138, 139, 154, 153, 119, 118, 117, 106, 80, 45, 70, 69, 126, 205, 77, 67, 19, 102, 46, 13, 108, 107, 109, 72, 71, 73, 23, 22, 25, 57, 48, 216, 155, 29, 24, 101, 35, 113, 115, 36, 37, 114, 110, 2, 14, 6, 16, 15, 17, 18, 74, 31, 30, 76, 12, 75, 8, 11, 5, 7, 99, 98, 100, 39, 38, 33, 32, 97, 96, 49, 44, 34, 50, 156, 26, 157, 42, 41, 43, 4, 28, 27, 9, 40, 21)
rx <- reorder(dd, y, agglo.FUN=mean)
order.rx <- order.dendrogram(rx)
write(order.rx, file="order_hosts_rx.txt", sep="\t")
write(labels(rx), file="labels_order_hosts_rx.txt", sep="\t")
X11()
par(cex=0.5)
plot(rx, main="Dendrogram of Syn9 genes")
I guess it has something to do with the heights of the leaves but I just want to flip the dendrogram...
Thanks in advance!
Miguel
You can use rev(dd); rev.dendrogram simply returns the dendrogram with reversed nodes:
hc <- hclust(dist(USArrests), "ave")
dd <- as.dendrogram(hc)
plot(dd)
plot(rev(dd))

Create new dataset removing variables with high inflation factors

I have a dataset of environmental variables I would like to use for a GLMM. I am using the corvif function from the AED package (http://www.highstat.com/Book2/AED_1.0.zip) to identify and remove variables with high inflation factors.
Instead of removing one variable at a time manually from my dataset with a GVIF values > 3 (highest value removed first), I would like to know how to write a loop to accomplish this task automatically with the result being a new dataset with only the remaining variables (i.e. those with GVIF values < 3).
Any suggestions for how to approach this problem for a new R user?
Here is my sample data:
WW_Covs <- structure(list(Latitude = c(62.4419, 67.833333, 65.95, 63.72935,
60.966667, 60.266667, 55.660455, 62.216667, 61.3, 61.4, 62.084139,
55.662566, 64.48508, 63.208354, 62.87591, 62.70856, 62.64009,
63.79488, 59.55, 62.84206), BIO_02 = c(87, 82, 75, 70, 77, 70,
59, 84, 84, 79, 85, 60, 91, 87, 74, 74, 76, 70, 76, 74), BIO_03 = c(26,
23, 25, 26, 25, 24, 25, 25, 26, 25, 26, 26, 24, 25, 24, 25, 25,
25, 26, 24), BIO_04 = c(8443, 9219, 7594, 6939, 7928, 7593, 6160,
8317, 8167, 7972, 8323, 6170, 9489, 8578, 7814, 7680, 7904, 7149,
7445, 7803), BIO_05 = c(201, 169, 151, 166, 194, 210, 202, 205,
204, 186, 205, 200, 200, 195, 170, 154, 180, 166, 219, 170),
BIO_06 = c(-131, -183, -144, -102, -107, -75, -26, -119,
-113, -120, -120, -28, -169, -143, -131, -142, -124, -111,
-72, -129), BIO_08 = c(128, 109, 85, 78, 122, 145, 153, 134,
130, 126, 132, 152, 120, 119, 115, 98, 124, 104, 147, 115
), BIO_09 = c(-31, -81, -16, 13, -60, -6, 25, -25, -25, -70,
-25, 23, -56, -39, -47, -60, -39, 8, 0, -46), BIO_12 = c(667,
481, 760, 970, 645, 557, 645, 666, 652, 674, 670, 670, 568,
598, 650, 734, 620, 868, 571, 658), BIO_13 = c(78, 77, 96,
109, 85, 70, 67, 77, 84, 93, 78, 68, 72, 78, 93, 99, 90,
96, 72, 93), BIO_15 = c(23, 40, 25, 21, 36, 30, 21, 24, 28,
34, 24, 22, 28, 29, 34, 32, 36, 22, 30, 34), BIO_19 = c(147,
85, 180, 236, 108, 119, 154, 149, 135, 118, 148, 162, 117,
119, 120, 141, 111, 204, 111, 122)), .Names = c("Latitude",
"BIO_02", "BIO_03", "BIO_04", "BIO_05", "BIO_06", "BIO_08", "BIO_09",
"BIO_12", "BIO_13", "BIO_15", "BIO_19"), row.names = c(1:20), class = "data.frame")
Sample code:
library(AED)
WW_Final <- corvif(WW_Covs)
test <- corvif(WW_Covs])
test[order(-test$GVIF), ]
if(test$GVIF[1,] > 3, # this is where I get stuck...
Here is an algorithm for doing this. I illustrate with the built-in dataset longley, and I also use function vif in package car, rather than using package AED:
It's not pretty, and should be wrapped inside a function, but I leave that as an exercise for the interested reader.
The code:
library(car)
dat <- longley
cutoff <- 2
flag <- TRUE
while(flag){
fit <- lm(Employed ~ ., data=dat)
vfit <- vif(fit)
if(max(vfit) > cutoff){
dat <- dat[, -which.max(vfit)]
} else {
flag <- FALSE
}
}
print(fit)
print(vfit)
The output:
Call:
lm(formula = Employed ~ ., data = dat)
Coefficients:
(Intercept) Unemployed Armed.Forces
50.66281 0.02265 0.02847
Unemployed Armed.Forces
1.032501 1.032501

Resources