I am generating a landscape pattern that evolves over time. The problem is that, although I have clearly defined a window for the object, the error below says the window is not recognised. I also do not see how any points could be falling outside the window, or why that would make a difference.
library(spatstat)
library(dplyr)
# Define the window
win <- owin(c(0, 100), c(0, 100))
# Define the point cluster
cluster1 <- rMatClust(kappa = 0.0005, scale = 0.1, mu = 20,
win = win, center = c(5,5))
# define the spread of the points
spread_rate <- 1
new_nests_per_year<-5
years<-10
# Plot the initial cluster
plot(win, main = "Initial cluster")
points(cluster1, pch = 20, col = "red")
newpoints <- list()
# Loop for n years
for (i in 1:years) {
  # Generate new points that spread from the cluster centroid
  # ([[1]] holds x and [[2]] holds y, matching the column names below)
  newpoints[[1]] <- rnorm(new_nests_per_year, mean = centroid.owin(cluster1)$x, sd = spread_rate)
  newpoints[[2]] <- rnorm(new_nests_per_year, mean = centroid.owin(cluster1)$y, sd = spread_rate)
  # Convert the list to a data frame
  newpoints_df <- data.frame(newpoints)
  # Rename the columns of the data frame
  colnames(newpoints_df) <- c("x", "y")
  # Combine the new points with the existing points
  cluster1_df <- data.frame(cluster1)
  newtotaldf <- bind_rows(cluster1_df, newpoints_df)
  cluster1 <- as.ppp(newtotaldf, x = newtotaldf$x, y = newtotaldf$y,
                     window = win)
  # Plot the updated cluster
  plot(win, main = paste("Cluster after year", i))
  points(cluster1, pch = 20, col = "red")
}
However, when I run the line:
cluster1 <- as.ppp(newtotaldf, x = newtotaldf$x, y = newtotaldf$y,
                   window = win)
I receive the error:
Error: x,y coords given but no window specified
Why would this be the case?
In your code, passing the window as W = win should solve the issue: as.ppp() expects the window in an argument named W, so your window = win is never matched and the coordinates arrive without a window. I also believe you can simplify the call without specifying x and y:
## ...[previous code]...
cluster1 <- as.ppp(newtotaldf, W = win)
plot(win)
points(cluster1, pch = 20, col = "red")
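Alternatively, a minimal sketch that builds the pattern with ppp() directly, which takes the coordinates and the window in one call (reusing newtotaldf and win from your code):
# ppp() accepts coordinate vectors plus the observation window
cluster1 <- ppp(x = newtotaldf$x, y = newtotaldf$y, window = win)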
I downloaded monthly data from [NASA data][1] and saved it in .txt and .asc format. I am trying to plot and extract the data from the ASCII file, but I am unable to do so. I tried the following:
1.
infile <- "OMI/L3feb09.txt"
data <- as.matrix(read.table(infile, skip = 3, header = FALSE, sep = "\t"))
data[data == -9999] = NA
rr <- raster(data, crs = "+init=epsg:4326")
extent(rr) = c(179.375, 179.375+1.25*288, -59.5, -59.5+1*120)
Then I tried to extract the data for Australia:
adm <- getData("GADM", country="AUS", level=1)
rr = mask(rr, adm)
plot(rr)
2.
library(rgdal)
r = raster("OMI/L3feb09.txt")
plot(r)
3.
library(raster)
r = raster("OMI/L3feb09.txt")
plot(r)
4. I also tried:
df1 <- read.table("OMI/L3feb09.txt", skip = 11, header = FALSE, sep = "\t")
I also tried the approaches from:
Stack Overflow link 1
Stack Overflow link 2 ([2])
The problem is that there are strings mixed in among the numbers in the file, such as "lat = -55.5".
I'd appreciate any kind of help. Thank you.
[2]: https://stackoverflow.com/questions/42064943/opening-an-ascii-file-using-r
So, I downloaded one file and played around with it! It is not the best solution, but I hope it gives you an idea.
library(stringr)
# read the data; sep = "" splits a marker like "lat = -59.5" across three
# fields, which makes those rows easier to find and remove
data <- read.csv("L3_tropo_ozone_column_oct04", header = FALSE, skip = 3, sep = "")
# each block of rows bounded by two "lat" rows holds the data for the later "lat"
lat_index <- which(data[, 1] == "lat")
# you need the last row that contains data, not the "lat" row itself
lat_index <- lat_index - 1
# define an empty array for the results
result <- array(NA, dim = c(120, 288),
                dimnames = list(lat = seq(-59.5, 59.5, 1),
                                lon = seq(-179.375, 179.375, 1.25)))
I assumed the data on each latitude comes as three-digit values, so that each block's character count is divisible by 3 and yields 288 values, which equals the number of longitude grid points. Correct me if I'm wrong.
# function to split a string into a vector of n-character chunks
split_n_parts <- function(input_string, n) {
  # break the string into its individual characters
  chars <- unlist(str_extract_all(input_string, boundary("character")))
  output_string <- vector(length = length(chars) / n)
  for (x in seq_along(output_string)) {
    # reassemble characters (x-1)*n+1 ... x*n into one chunk
    output_string[x] <- paste0(chars[((x - 1) * n + 1):(x * n)], collapse = "")
  }
  return(as.numeric(output_string))
}
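For instance, a quick check of the helper on a toy string (a made-up input, just to illustrate):
split_n_parts("123456789", 3)
#> [1] 123 456 789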
Here, the code loops over the blocks, collects each latitude's data, and writes it into the result array:
# loop over the row blocks bounded by two "lat" markers, paste each block
# into one long string, and assign the parsed values to the array
for (i in 1:length(lat_index)) {
  if (i == 1) {
    for (j in 1:lat_index[i]) {
      if (j == 1) {
        row_j <- paste0(data[j, ])
      } else {
        row_j <- paste0(row_j, data[j, ])
      }
    }
  } else {
    ii <- i - 1
    lower_limit <- lat_index[ii] + 4
    upper_limit <- lat_index[i]
    for (j in lower_limit:upper_limit) {
      if (j == lower_limit) {
        row_j <- paste0(data[j, ])
      } else {
        row_j <- paste0(row_j, data[j, ])
      }
    }
  }
  result[i, ] <- split_n_parts(row_j, 3)
}
Here is the final array, plotted as an image:
#plot as image
image(result)
EDIT: To complete the solution and show the end result:
# the data is in Dobson units x 10
result <- result / 10
# melt the array to a data frame
library(plyr)
result_df <- adply(result, c(1, 2))
result_df$lat <- as.numeric(as.character(result_df$lat))
result_df$lon <- as.numeric(as.character(result_df$lon))
# plotting
library(maps)
library(ggplot2)
library(tidyverse)
world_map <- map_data("world")
# colors
jet.colors <- colorRampPalette(c("white", "cyan", "#7FFF7F", "yellow",
                                 "#FF7F00", "red", "#7F0000"))
ggplot() +
  geom_raster(data = result_df, aes(fill = V1, x = lon, y = lat)) +
  geom_polygon(data = world_map, aes(x = long, y = lat, group = group),
               fill = NA, colour = "black") +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_gradientn(colors = jet.colors(7))
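Since the question also tried raster-based extraction, here is a minimal sketch that converts the array into a raster for masking or plotting; the grid bounds are my assumption, derived from the 1 x 1.25 degree cell centres above:
library(raster)
# rows of `result` run south to north while raster rows run top-down, so flip them;
# bounds follow from the cell centres: lat -59.5..59.5, lon -179.375..179.375
rr <- raster(result[nrow(result):1, ],
             xmn = -180, xmx = 180, ymn = -60, ymx = 60,
             crs = "+proj=longlat +datum=WGS84")
plot(rr)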
Given a data frame containing mixed variables (i.e. both categorical and continuous) like,
digits = 0:9
# set seed for reproducibility
set.seed(17)
# function to create random string
createRandString <- function(n = 5000) {
a <- do.call(paste0, replicate(5, sample(LETTERS, n, TRUE), FALSE))
paste0(a, sprintf("%04d", sample(9999, n, TRUE)), sample(LETTERS, n, TRUE))
}
df <- data.frame(ID=c(1:10), name=sample(letters[1:10]),
studLoc=sample(createRandString(10)),
finalmark=sample(c(0:100),10),
subj1mark=sample(c(0:100),10),subj2mark=sample(c(0:100),10)
)
I perform unsupervised feature selection using the package FactoMineR
df.princomp <- FactoMineR::FAMD(df, graph = FALSE)
The variable df.princomp is a list.
Thereafter, to visualize the principal components, I use fviz_screeplot() and fviz_contrib(), like:
#library(factoextra)
factoextra::fviz_screeplot(df.princomp, addlabels = TRUE,
barfill = "gray", barcolor = "black",
ylim = c(0, 50), xlab = "Principal Component",
ylab = "Percentage of explained variance",
main = "Principal Component (PC) for mixed variables")
factoextra::fviz_contrib(df.princomp, choice = "var",
axes = 1, top = 10, sort.val = c("desc"))
which gives the following Fig1 and Fig2.
Explanation of Fig1: Fig1 is a scree plot. A scree plot is a simple line segment plot that shows the fraction of total variance in the data explained by each principal component (PC). Here we can see that the first three PCs collectively account for 43.8% of the total variance. The question now naturally arises: "What are these variables?" This is shown in Fig2.
Explanation of Fig2: This figure visualizes the contributions of the variables from the results of the analysis. From it I can see that the variables name, studLoc and finalmark are the most important and can be used for further analysis.
Further analysis (where I'm stuck): to retrieve the contributions of the aforementioned variables name, studLoc and finalmark, I index into df.princomp (see above) like df.princomp$quanti.var$contrib[,4] and df.princomp$quali.var$contrib[,2:3].
I have to manually specify the column indices [,2:3] and [,4].
What I want: I want to know how to assign the column indices dynamically, so that I do not have to hand-code an index like [,2:3] into the list df.princomp.
I've already looked at the similar questions 1, 2, 3 and 4 but cannot find a solution there. Any help or suggestions would be appreciated.
Not sure if my interpretation of your question is correct; apologies if not. From what I gather, you are using PCA as an initial tool to show you which variables matter most in explaining the dataset. You then want to go back to your original data, select those variables quickly without hand-coding them each time, and use them for some other analysis.
If that is correct, then I have saved the data behind the contribution plot, filtered for the variables with the greatest contributions, and used that result to build a new data frame containing those variables alone.
## ...[data frame, FAMD, and scree plot code from the question]...
# find the top contributing variables to the overall variation in the dataset;
# here I choose the top 10 variables (although we only have 6 in our df).
# note you can specify which axes to look at with axes=, even axes = c(1, 2)
f <- factoextra::fviz_contrib(df.princomp, choice = "var",
                              axes = c(1), top = 10, sort.val = c("desc"))
# save the data behind the contribution plot
dat <- f$data
# keep the variables whose contribution is higher than, say, 20%
r <- rownames(dat[dat$contrib > 20, ])
# extract these from the original data frame into a new one for further analysis
new <- df[r]
new
#finalmark name studLoc
#1 53 b POTYQ0002N
#2 73 i LWMTW1195I
#3 95 d VTUGO1685F
#4 39 f YCGGS5755N
#5 97 c GOSWE3283C
#6 58 g APBQD6181U
#7 67 a VUJOG1460V
#8 64 h YXOGP1897F
#9 15 j NFUOB6042V
#10 81 e QYTHG0783G
Based on your comment, where you said you wanted to 'Find variables with value greater than 5 in Dim.1 AND Dim.2 and save these variables to a new data frame', I would do this:
# top contributors to both Dim 1 and Dim 2
f <- factoextra::fviz_contrib(df.princomp, choice = "var",
                              axes = c(1, 2), top = 10, sort.val = c("desc"))
# save the data behind the contribution plot
dat <- f$data
# keep the variables whose contribution is higher than 5%
r <- rownames(dat[dat$contrib > 5, ])
# extract these from the original data frame into a new one for further analysis
new <- df[r]
new
(This keeps all the original variables in our new data frame since they all contributed more than 5% to the total variance)
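If you would rather not hard-code the cutoff either, a small sketch of one option: reuse the expected average contribution (the red dashed reference line that fviz_contrib draws) as a data-driven threshold.
# keep variables above the expected average contribution of 100 / (number of variables)
threshold <- 100 / nrow(dat)
r <- rownames(dat[dat$contrib > threshold, ])
new <- df[r]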
There are a lot of ways to extract the contributions of individual variables to PCs. For numeric input one can run a PCA with prcomp and look at $rotation (I spoke too soon and forgot you've got factors here, so prcomp won't work directly). Since you are using factoextra::fviz_contrib, it makes sense to check how that function extracts this information under the hood. Type factoextra::fviz_contrib at the console and read the function:
> factoextra::fviz_contrib
function (X, choice = c("row", "col", "var", "ind", "quanti.var",
"quali.var", "group", "partial.axes"), axes = 1, fill = "steelblue",
color = "steelblue", sort.val = c("desc", "asc", "none"),
top = Inf, xtickslab.rt = 45, ggtheme = theme_minimal(),
...)
{
sort.val <- match.arg(sort.val)
choice = match.arg(choice)
title <- .build_title(choice[1], "Contribution", axes)
dd <- facto_summarize(X, element = choice, result = "contrib",
axes = axes)
contrib <- dd$contrib
names(contrib) <- rownames(dd)
theo_contrib <- 100/length(contrib)
if (length(axes) > 1) {
eig <- get_eigenvalue(X)[axes, 1]
theo_contrib <- sum(theo_contrib * eig)/sum(eig)
}
df <- data.frame(name = factor(names(contrib), levels = names(contrib)),
contrib = contrib)
if (choice == "quanti.var") {
df$Groups <- .get_quanti_var_groups(X)
if (missing(fill))
fill <- "Groups"
if (missing(color))
color <- "Groups"
}
p <- ggpubr::ggbarplot(df, x = "name", y = "contrib", fill = fill,
color = color, sort.val = sort.val, top = top, main = title,
xlab = FALSE, ylab = "Contributions (%)", xtickslab.rt = xtickslab.rt,
ggtheme = ggtheme, sort.by.groups = FALSE, ...) + geom_hline(yintercept = theo_contrib,
linetype = 2, color = "red")
p
}
<environment: namespace:factoextra>
So it's really just calling facto_summarize from the same package. By analogy you can do the same thing, simply call:
> dd <- factoextra::facto_summarize(df.princomp, element = "var", result = "contrib", axes = 1)
> dd
name contrib
ID ID 0.9924561
finalmark finalmark 21.4149175
subj1mark subj1mark 7.1874438
subj2mark subj2mark 16.6831560
name name 26.8610132
studLoc studLoc 26.8610132
And that's the table corresponding to your figure 2. For PC2 use axes = 2 and so on.
Regarding "how to programmatically determine the column indices of the PCs", I'm not 100% sure I understand what you want, but if you just want to say for column "finalmark", grab its contribution to PC3 you can do the following:
library(tidyverse)
# make a tidy table of all column names in the original df with their contributions to all PCs
contribution_df <- map_df(set_names(1:5),
                          ~factoextra::facto_summarize(df.princomp, element = "var",
                                                       result = "contrib", axes = .x),
                          .id = "PC")
# get the contribution of column 'finalmark' by name
contribution_df %>%
filter(name == "finalmark")
# get the contribution of column 'finalmark' to PC3
contribution_df %>%
filter(name == "finalmark" & PC == 3)
# or, just the numeric value of contribution
filter(contribution_df, name == "finalmark" & PC == 3)$contrib
BTW I think ID in your example is treated as numeric instead of factor, but since it's just an example I'm not bothering with it.
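(If you do want ID treated as categorical, a one-line sketch: convert it before running FAMD.)
# make FAMD treat ID as a qualitative variable
df$ID <- factor(df$ID)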
I am trying to draw several lines in the same plot, where the x value is determined by a date and the y value by a number. I first load the data, store it in a list, and save the min and max values for the dates:
stocks <- list()
stocks.min <- 0
stocks.max <- 0
stocks.min.date <- NULL
stocks.max.date <- NULL
for (name in names(files))
{
stocks[[name]] <- read.csv(files[[name]], sep=";")
# Convert to date in R
stocks[[name]]$Date <- as.Date(stocks[[name]]$Date, "%d/%m/%Y")
# Sets max value for ylim in the plotting
if (stocks.max < max(stocks[[name]]$Close))
{
stocks.max <- max(stocks[[name]]$Close)
}
# Sets the date value for the xlim in the plot
if (is.null(stocks.min.date) || min(stocks[[name]]$Date) < stocks.min.date)
{
stocks.min.date <- min(stocks[[name]]$Date)
}
if (is.null(stocks.max.date) || max(stocks[[name]]$Date) > stocks.max.date)
{
stocks.max.date <- max(stocks[[name]]$Date)
}
}
After that I create an empty plot using the values from above:
plot(0, xlab="Time", ylab="Closing Prices", main="Stock Values",
xlim=c(stocks.min.date, stocks.max.date), ylim=c(stocks.min, stocks.max))
And then I add the lines with the data:
for (name in names(stocks))
{
lines(x=stocks[[name]]$Date, y=stocks[[name]]$Close, col=colors[[name]], type="l",
lwd=2)
}
When the graph is plotted, the data is correctly displayed, but the x axis shows the dates as plain numbers, as seen in the image below.
How can I correct this issue?
I would strongly suggest using normalized series to plot the stock data you have; quantmod helps a lot here. It serves two purposes:
Get the x-axis labels as dates.
Normalize the series so that you can view any number of them without worrying about the orders of their absolute values (~67 for INR, ~1120 for KRW, and so on).
This is what I generally use for my purposes.
library(quantmod)
tickers <- c('GOOG', 'MSFT', 'AAPL', 'AMZN')
getSymbols(tickers, src = 'yahoo', from = '2015-01-01')
normalise <- function(x) x/as.numeric(x)[1] - 1
chart_theme <- chart_theme()
chart_theme$col$line.col <- "red"
chart_Series(normalise(Cl(GOOG)), theme = chart_theme)
add_TA(normalise(Cl(MSFT)), on = 1, col = "black", lty = 1)
add_TA(normalise(Cl(AMZN)), on = 1, col = "blue", lty =1)
add_TA(normalise(Cl(AAPL)), on = 1, col = "darkgreen", lty =2)
Hope this helps.
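That said, if you want to stay with base graphics, here is a minimal sketch of the direct fix, reusing the variables from your question: the axis shows numbers because plot(0, ...) sets up a plain numeric axis, while passing Date objects as the x data lets R format the tick labels as dates.
# set up the empty plot with Date objects on the x axis so labels render as dates
plot(x = c(stocks.min.date, stocks.max.date), y = c(stocks.min, stocks.max),
     type = "n", xlab = "Time", ylab = "Closing Prices", main = "Stock Values")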
I've written code to plot density data for variations of an A/B test. I'd like to improve the visual by shading the area below each curve, with a slightly transparent fill. I'm currently using matplot, but I understand ggplot might be a better option. Any ideas? Thanks.
# Setup data frame - these are results from an A/B experiment
conv_data = data.frame(
VarNames = c("Variation 1", "Variation 2", "Variation 3") # Set variation names
,NumSuccess = c(1,90,899) # Set number of successes / conversions
,NumTrials = c(10,100,1070) # Set number of trials
)
conv_data$NumFailures = conv_data$NumTrials - conv_data$NumSuccess # Set number of failures [no conversions]
num_var = NROW(conv_data) # Set total number of variations
plot_col = rainbow(num_var) # Set plot colors
get_density_data <- function(n_var, s, f) {
  x = seq(0, 1, length.out = 100) # 100 evenly spaced points from 0 to 1
  dens_data = matrix(data = NA, nrow = length(x), ncol = (n_var + 1))
  dens_data[, 1] = x
  # set density data
  for (j in 1:n_var) {
    # +1 to s[], f[] to ensure uniform prior
    dens_data[, j + 1] = dbeta(x, s[j] + 1, f[j] + 1)
  }
  return(dens_data)
}
density_data = get_density_data(num_var, conv_data$NumSuccess, conv_data$NumFailures)
matplot(density_data[,1]*100, density_data[,-1], type = "l", lty = 1, col = plot_col, ylab = "Probability Density", xlab = "Conversion Rate %", yaxt = "n")
legend("topleft", col=plot_col, legend = conv_data$VarNames, lwd = 1)
This produces the following plot:
## ...[same data setup, matplot and legend code as in the question]...
## and add this part
for (ii in seq_along(plot_col))
polygon(c(density_data[, 1] * 100, rev(density_data[, 1] * 100)),
c(density_data[, ii + 1], rep(0, nrow(density_data))),
col = adjustcolor(plot_col[ii], alpha.f = .25))
I was able to answer my own question with:
df = as.data.frame(t(conversion_data))
dfs = stack(df)
ggplot(dfs, aes(x=values)) + geom_density(aes(group=ind, colour=ind, fill=ind), alpha=0.3)
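Note that geom_density() re-estimates a kernel density from raw draws, which is not quite the same thing as the exact beta densities computed earlier. For completeness, a minimal ggplot2 sketch that shades the exact curves from density_data above (the names dens_df and dens_long are mine, for illustration):
library(ggplot2)
library(tidyr)
# reshape the density matrix from the question into long format
dens_df <- as.data.frame(density_data)
colnames(dens_df) <- c("x", as.character(conv_data$VarNames))
dens_long <- pivot_longer(dens_df, -x, names_to = "Variation", values_to = "density")
# one shaded, semi-transparent area per variation
ggplot(dens_long, aes(x = x * 100, y = density, colour = Variation, fill = Variation)) +
  geom_area(alpha = 0.25, position = "identity") +
  labs(x = "Conversion Rate %", y = "Probability Density")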