I have a dataframe with lat, long and dozens of other columns. Basically, I want to write a loop where X=Long,Y=Lat remains constant, but the color changes in every loop. The color is basically the other columns, one for every plot. How can I do this?
library(maps)
library(ggplot2)
library(RColorBrewer)
library(reshape)
usamap <- ggplot2::map_data("state")
myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))
simplefun<-function(colname){
ggplot()+
geom_polygon( data=usamap, aes(x=long, y=lat, group=group),colour="black",fill="white")+
geom_point(data=stat,aes_string(x=stat$longitude,y=stat$latitude,color=colname))+
scale_colour_gradientn(name="name",colours = myPalette(10))+
xlab('Longitude')+
ylab('Latitude')+
coord_map(projection = "mercator")+
theme_bw()+
theme(line = element_blank())+
theme(legend.position = c(.93,.20),panel.grid.major = element_line(colour = "#808080"))+
ggsave(paste0(colname,".png"),width=10, height=8,dpi=300)
}
colname<-names(stat[4:16])
lapply(colname,simplefun)
dput(droplevels(stat))
structure(list(siteId = structure(1:16, .Label = c("US1NYAB0001",
"US1NYAB0006", "US1NYAB0010", "US1NYAB0021", "US1NYAB0023", "US1NYAB0028",
"US1NYAB0032", "US1NYAL0002", "US1NYBM0004", "US1NYBM0007", "US1NYBM0011",
"US1NYBM0014", "US1NYBM0021", "US1NYBM0024", "US1NYBM0032", "US1NYBM0034"
), class = "factor"), latitude = c(42.667, 42.7198, 42.5455,
42.6918, 42.6602, 42.7243, 42.5754, 42.2705, 42.0296, 42.0493,
42.0735, 42.3084, 42.0099, 42.1098, 42.1415, 42.0826), longitude = c(-74.0509,
-73.9304, -74.1475, -73.8311, -73.8103, -73.757, -73.7995, -77.9419,
-76.0213, -76.0288, -75.9296, -75.9569, -75.5142, -75.8858, -75.889,
-75.9912), no = c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L), min_obs = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), min_mod = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), avg_obs = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.15,
0, 0, 0, 0, 0, 0), avg_mod = c(3136.8388671875, 2997.28173828125,
3258.61840820312, 2970.74340820312, 2992.9765625, 0, 3075.54443359375,
2701.03662109375, 2974.23413085938, 2967.5029296875, 3004.57861328125,
2965.07470703125, 3260.25463867188, 3028.55590820312, 2981.8876953125,
0), max_obs = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0,
0), max_mod = c(6273.677734375, 5994.5634765625, 6517.23681640625,
5941.48681640625, 5985.953125, 0, 6151.0888671875, 5402.0732421875,
5948.46826171875, 5935.005859375, 6009.1572265625, 5930.1494140625,
6520.50927734375, 6057.11181640625, 5963.775390625, 0), mean_bias = c(0,
0, 0, 0, 0, NaN, 0, 0, 0, 5.05475490855863e-05, 0, 0, 0, 0, 0,
NaN), corr_coef = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA,
NA, NA, NA, NA, NA), additive_bias = c(-6273.677734375, -5994.5634765625,
-6517.23681640625, -5941.48681640625, -5985.953125, 0, -6151.0888671875,
-5402.0732421875, -5948.46826171875, -5934.705859375, -6009.1572265625,
-5930.1494140625, -6520.50927734375, -6057.11181640625, -5963.775390625,
0), mean_error = c(-3136.8388671875, -2997.28173828125, -3258.61840820312,
-2970.74340820312, -2992.9765625, 0, -3075.54443359375, -2701.03662109375,
-2974.23413085938, -2967.3529296875, -3004.57861328125, -2965.07470703125,
-3260.25463867188, -3028.55590820312, -2981.8876953125, 0), mean_abs_error = c(3136.8388671875,
2997.28173828125, 3258.61840820312, 2970.74340820312, 2992.9765625,
0, 3075.54443359375, 2701.03662109375, 2974.23413085938, 2967.3529296875,
3004.57861328125, 2965.07470703125, 3260.25463867188, 3028.55590820312,
2981.8876953125, 0), rmse = c(4436.16006895562, 4238.79648453055,
4608.38234747949, 4201.26561821133, 4232.7080465523, 0, 4349.47664966936,
3819.84262201718, 4206.20224553428, 4196.4707575116, 4249.11582411849,
4193.24886413303, 4610.69632679956, 4283.02483978603, 4217.02602018439,
0)), .Names = c("siteId", "latitude", "longitude", "no", "min_obs",
"min_mod", "avg_obs", "avg_mod", "max_obs", "max_mod", "mean_bias",
"corr_coef", "additive_bias", "mean_error", "mean_abs_error",
"rmse"), row.names = c(NA, -16L), class = "data.frame")
I had the same problem and I solve it like this:
Let's assume ggplotdata in my code is like your dataframe (with more than two columns) in the second post. (dput(droplevels(stat))?)
library(reshape) # package for melting data
shapes <- 1:ncol(ggplot_data) # number of shapes
ggplot_data <- melt(ggplot_data, id = "X1") # melt data together
p1 <- ggplot(ggplot_data, aes(X1,value))
p1 <- p1 +aes(shape = factor(variable))+ # different shapes
geom_point(aes(colour = factor(variable)))+ # different colors
scale_shape_manual(labels=colname, values = shapes)+ # same for the legend
scale_color_manual(labels=colname, values = mypalette) # same for legend
Related
I have the following dataframe for which I am trying to calculate the precision of observations by group.
df<- structure(list(BLG = c(77.634011090573, 119.341563786008, 12.0603015075377,
0, 155.275381552754, 117.391304347826, 81.1332904056665, 3.96563119629874,
91.566265060241), GSF = c(11.090573012939, 4.11522633744856,
0, 0, 0, 0, 0, 0, 0), LMB = c(73.9371534195933, 28.8065843621399,
24.1206030150754, 20.2360876897133, 59.721300597213, 13.0434782608696,
38.6349001931745, 31.7250495703899, 28.9156626506024), YLB = c(14.7874306839187,
4.11522633744856, 0, 0, 0, 0, 0, 0, 0), BLC = c(7.39371534195933,
0, 0, 20.2360876897133, 3.9814200398142, 0, 0, 7.93126239259749,
9.63855421686747), WHC = c(0, 0, 0, 0, 3.9814200398142, 0, 0,
0, 0), RSF = c(0, 0, 0, 0, 11.9442601194426, 0, 0, 0, 4.81927710843374
), CCF = c(0, 0, 0, 0, 0, 0, 0, 0, 0), BLB = c(0, 0, 0, 0, 0,
0, 0, 0, 0), group = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L)), row.names = c(NA,
-9L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x00000270a7061ef0>)
I am trying to find the precision with this formula:
Y_estimated= the value of in each cell of df
Y_true= y_true<- c(83, 10, 47, 8, 9, 6, 12, 5, 8) #the true value for each column in df
R= number of observations in each group (in this case=3)
After applying the formula, I should have 3 measures of precision for each column. But I am unsure of how to make this formula into a function that will do this. Specifically the applying the epsilon by group and defining R.
I've been working on the following:
estimate = function(df, y_true) {
R = 3
y_estimated = (df, .SD)
(sum((sqrt( (y_estimated - y_true)^2 / 3))) / y_true) * 100
}
But apart from this throwing errors (I think from the .SD in the y_estimated), I have to manually put in the value of R which I hope to not have to do given that this will be applied on data frames with multiple group sizes.
Any help would be greatly appreciated.
I'm trying to create a bar plot using ggplot2 and my data is in this format:
dput here:
structure(list(clade = structure(c(1L, 3L, 2L, 3L, 2L, 2L), .Label = c("19A",
"20A", "20B", "20E (EU1)", "20I (Alpha, V1)", "20J (Gamma, V3)",
"21J (Delta)"), class = "factor"), C.T = c(0, 4, 4, 4, 4, 4),
A.G = c(0, 1, 1, 1, 1, 1), G.A = c(0, 2, 0, 2, 0, 0), G.C = c(0,
1, 0, 1, 0, 0), T.C = c(0, 0, 0, 0, 0, 0), C.A = c(0, 0,
0, 0, 0, 0), G.T = c(0, 0, 0, 0, 0, 0), A.T = c(0, 0, 0,
0, 0, 0), T.A = c(0, 0, 0, 0, 0, 0), T.G = c(0, 0, 0, 0,
0, 0), A.C = c(0, 0, 0, 0, 0, 0), C.G = c(0, 0, 0, 0, 0,
0), A.del = c(0, 0, 0, 0, 0, 0), TAT.del = c(0, 0, 0, 0,
0, 0), TCTGGTTTT.del = c(0, 0, 0, 0, 0, 0), TACATG.del = c(0,
0, 0, 0, 0, 0), AGTTCA.del = c(0, 0, 0, 0, 0, 0), GATTTC.del = c(0,
0, 0, 0, 0, 0)), row.names = c(NA, -6L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x0000014b25a51ef0>)
I'd like to create 7 bar plots (one for each "clade") where the X axis would have the columns of the data frame (C.T would be 1 bar, A.G would be another bar, etc) and the Y axis would be the count. Essentially, for each clade, print a barplot with the counts of column.
For example, for the bar plot of the clade "20B" and the bar name "C.T" the count would be the sum of the values from the data frame. Can I do that in this wide format? Do I need to transform the data to a long format instead?
I was trying to apply this SO answer: Plotting error bar on bar chart for a data frame in wide format using ggplot but I keep getting choose another strategy with names_repair
Thank you in advance, any help is very welcome!
I have a problem using the descdist() function of the fitdistrplus package on my data frame. I think the problem comes from the data type: double. I would like to avoid the conversion to double when importing my csv, but keeping the data as a numeric (I apparently cannot convert them back using as.numeric, it remains as double after that).
Here is my code to import the dataset:
setwd("[directory]")
data=read.csv('data_BehCoor.csv', header=T, sep=";", dec=".", fill=T)
require("fitdistrplus")
descdist(data$stateTSp)
returns the following error
Error in plot.window(...) : 'xlim' needs finite values
An idea of the data:
dput(head(data))
structure(list(day = c(2L, 2L, 2L, 2L, 2L, 2L),
trial = c(1L, 1L, 1L, 1L, 1L, 1L), ID = structure(c(2L, 2L, 3L, 3L, 4L, 4L),
.Label = c("", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L"),
class = "factor"),
condition = structure(c(3L, 3L, 3L, 3L, 2L, 2L), .Label = c("",
"C", "T"), class = "factor"), gender = structure(c(3L, 2L,
3L, 2L, 3L, 2L), .Label = c("", "F", "M"), class = "factor"),
TSp = c(5, 3, 1, 5, 0, 6), AGR = c(3, 0, 0, 0, 0, 0), beAGR = c(0,
3, 0, 0, 0, 0), FOR = c(27.729, 24.51, 51.459, 37.645, 34.489,
34.281), FOR_noTSp = c(22.729, 21.51, 50.459, 32.645, 34.489,
28.281), NI = c(39.857, 82.421, 76.922, 9.277, 265.484, 249.692
), stateTSp = c(55.858, 21.607, 0, 79.961, 0, 2.001), TSpFOR = c(20.345,
8.408, 0, 0, 0, 0), tot_duration = c(136.967, 136.967, 128.395,
128.395, 300, 300), OL_FOR = c(3.746, 3.746, 5.002, 5.002,
10.081, 10.081), OL_FOR_stateTSp = c(4.563, 10.907, 41.703,
0, 0, 0), OL_FOR_TSpFOR = c(3.372, 1.113, 0, 0, 0, 0), OL_FOR_NI = c(11.041,
8.496, 2.748, 27.639, 19.655, 18.191), OL_stateTSp_FOR = c(10.907,
4.563, 0, 41.703, 0, 0), OL_stateTSp = c(3.249, 3.249, 0,
0, 0, 0), OL_stateTSp_TSpFOR = c(4.034, 0, 0, 0, 0, 0),
OL_stateTSp_NI = c(36.66,
11.79, 0, 37.249, 0, 2.001), OL_TSpFOR_FOR = c(1.113, 3.372,
0, 0, 0, 0), OL_TSpFOR_stateTSp = c(0, 4.034, 0, 0, 0, 0),
OL_TSpFOR = c(0, 0, 0, 0, 0, 0), OL_TSpFOR_NI = c(18.23,
2.499, 0, 0, 0, 0), overlap_NI_FOR = c(8.496, 11.041, 27.639,
2.748, 18.191, 19.655), OL_NI_stateTSp = c(11.79, 36.66,
37.249, 0, 2.001, 0), OL_NI_TSpFOR = c(2.499, 18.23, 0, 0,
0, 0), OL_NI = c(16.065, 16.065, 6.528, 6.528, 230.021, 230.021
), AGR_in_FOR = c(0, 0, 0, 0, 0, 0), AGR_in_stateTSp = c(0,
0, 0, 0, 0, 0), AGR_in_TSpFOR = c(0, 0, 0, 0, 0, 0), AGR_in_NI = c(3,
0, 0, 0, 0, 0), beAGR_in_FOR = c(0, 0, 0, 0, 0, 0), beAGR_in_stateTSp = c(0,
0, 0, 0, 0, 0), beAGR_in_TSpFOR = c(0, 0, 0, 0, 0, 0), beAGR_in_NI = c(0,
3, 0, 0, 0, 0), comment = structure(c(1L, 1L, 1L, 1L, 1L,
1L),
.Label = c("", "moved the plate too fast"), class = "factor")),
row.names = c(NA, 6L), class = "data.frame")
Thanks in advance
fitdistrplus::descdist works fine with type double, see below:
foo <- runif(50, min = 1, max = 100)
typeof(foo)
fitdistrplus::descdist(foo)
I have two matrices sourced from the same dataset but with different amounts of data available for each.
I want to create a dataset that is a replicate of x in terms of column names and row names but which contains the data values in y. If the data is not available then an NA would be used as the value for that coordinate.
Not all of the row names in x are present in y and vice versa. The same holds true for the column names.
For the example input data I've given below, the rownames in x corresponding to those in y are the rowname start and end at | (I want to retain everthing after the | for other mappings).
What is the most efficient way to do this?
DESIRED OUTPUT
z = structure(c(NA, 1, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA, NA, NA,
NA, 0, NA, NA, NA, 0, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Dim = c(11L, 5L), .Dimnames = list(
c("AACSL|729522", "AACS|65985", "AADACL2|344752", "AADACL3|126767",
"AADACL4|343066", "AADAC|13", "AADAT|51166", "AAGAB|79719",
"AAK1|22848", "AAK12|14", "AANAT|15"), c("S18", "S20", "S45",
"S95", "S100")))
EXAMPLE INPUT
x = structure(c(0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0), .Dim = c(11L,
5L), .Dimnames = list(c("AACSL|729522", "AACS|65985", "AADACL2|344752",
"AADACL3|126767", "AADACL4|343066", "AADAC|13", "AADAT|51166",
"AAGAB|79719", "AAK1|22848", "AAK12|14", "AANAT|15"), c("S18",
"S20", "S45", "S95", "S100")))
y = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0), .Dim = c(11L, 4L), .Dimnames = list(c("A1BG",
"A1CF", "A2ML1", "A4GALT", "AACS", "AAK1", "AARD", "AARS2", "AASDHPPT",
"AASS", "BAACS"), c("S18", "S10", "S45", "S95")))
I think there might be a slight problem with the example that you provided, i can not see how the z is coming from the x and y above.. see this code:
intersect(sapply(rownames(x), #I am just extracting the letter codes here
function(i){
return(
strsplit(x=i,split="|",fixed=TRUE)[[1]][[1]])
}),rownames(y))
#[1] "AACS" "AAK1"
weird, right? I mean, there is only 2 codes in y compared to x. However, I think the code below does what you are planning (with the exception of this inconsistency):
library(data.table)
library(reshape2)
library(dplyr)
x %>% as.data.frame %>% mutate(rownames=rownames(x)) %>%
mutate(nms=sapply(rownames(x),
function(i){
return(
strsplit(x=i,split="|",fixed=TRUE)[[1]][[1]])
})) %>%
melt(id.vars=c("nms","rownames")) %>%
merge(., y %>% as.data.frame %>% mutate(nms=rownames(y))%>% melt(id.vars="nms"), by=c("variable","nms"), all.x=TRUE) %>%
select(-nms, -value.x) %>% dcast(formula = rownames~variable, value.var="value.y") -> xy
#now put back the column names where they belong
rownames(xy)<-xy$rownames
#now the only thing left is to arrange the columns
xy[rownames(x),colnames(x)] -> xy
Or am I wrong in understanding some of your points?
I have tried to follow different answers here but none worked. I went through the plotly official documentation and came up with following:
Data
Following is a sample of the data set:
> dput(head(df))
structure(list(ID = c(-1, -1, -1, -1, -1, -1), spacing.ft = c(0,
0, 0, 0, 0, 0), gap.s = c(0, 0, 0, 0, 0, 0), frspacing.ft = c(0,
0, 0, 0, 0, 0), TTC = c(0, 0, 0, 0, 0, 0), LV.vel.fps = c(0,
0, 0, 0, 0, 0), x = c(0, 0, 0, 0, 0, 0), y = c(0, 0, 0, 0, 0,
0), z = c(0, 0, 0, 0, 0, 0), frames = 29373:29378, df16 = c(6L,
6L, 6L, 6L, 6L, 6L), ADO.name = structure(c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), .Label = c("BlueT5",
"ghtTFrei10", "ilT6Carg", "owT8Yell", "CargoT4", "MoveT12", "RaceT11",
"RedT1", "SemiT3", "StarT7", "WhiteT2", "artTWalm9"), class = "factor"),
speed.fps.ED = c(33.25, 33.4, 33.55, 33.7, 33.84, 33.99),
deltaV.fps = c(33.25, 33.4, 33.55, 33.7, 33.84, 33.99)), .Names = c("ID",
"spacing.ft", "gap.s", "frspacing.ft", "TTC", "LV.vel.fps", "x",
"y", "z", "frames", "df16", "ADO.name", "speed.fps.ED", "deltaV.fps"
), row.names = c(NA, 6L), class = "data.frame")
What I want to do:
I want to customize the tooltip to add speed, speed.fps.ED. I tried following:
library(ggplot2)
library(plotly)
mt.plot <- ggplot() +
geom_point(data = df,
mapping = aes(x = deltaV.fps, y = frspacing.ft, color = ADO.name))
# Build the ggplot:
p <- plotly_build(mt.plot)
# Change the tooltip:
p$data[[1]]$text <- paste("ED.speed = ", df$speed.fps.ED)
p$filename <- 'test'
r <- plotly_POST(p)
knit_print.plotly(r, options=list())
You can see the resulting plot here: Plot.
Problem
The problem is that the third element in the tooltip is displayed only for 1 ADO.name i.e. BlueT5. I want it to be visible for all ADO.names. What is the problem here?
You can add speed.fps.ED to the ggplot aesthetic, as in:
geom_point(data = df,
aes(x = deltaV.fps, y = frspacing.ft, color = ADO.name, label = speed.fps.ED))
See also: how to choose variable to display in tooltip when using ggplotly?