Related
I've got a batch of survey data that I'd like to be able to subset on a few specific columns which have 0-10 scale data (e.g. Rank your attitude towards x as 0 to 10) so that I can plot using using ggplot() + facet_grid. Faceting will be using 3 hi/med/low bins calculated as +1 / -1 standard deviation above the mean. I have working code, which splits the overall dataframe into 3 parts like so:
# Generate sample data:
structure(list(Q4 = c(2, 3, 3, 5, 4, 3), Q5 = c(1, 3, 3, 3, 2,
2), Q6 = c(4, 3, 3, 3, 4, 4), Q7 = c(4, 2, 3, 5, 5, 5), Q53_1 = c(5,
8, 4, 5, 4, 5)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
# Aquire Q53_1 data as factors
political_scale <- factor(climate_experience_data$Q53_1, levels = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
# Generate thresholds based on mean and standard deviation thresholds
low_threshold <- round(mean(as.numeric(political_scale, na.rm = T)) - sd(as.numeric(political_scale)), digits = 0)
high_threshold <- round(mean(as.numeric(political_scale, na.rm = T)) + sd(as.numeric(political_scale)), digits = 0)
# Generate low/med/high bins based on Mean and SD
political_lr_low <- filter(climate_experience_data, Q53_1 <= low_threshold)
political_lr_mid <- filter(climate_experience_data, Q53_1 < high_threshold & Q53_1 > low_threshold)
political_lr_high <- filter(climate_experience_data, Q53_1 >= high_threshold)
What I've realised is that this approach really doesn't lend itself to faceting. What I suspect is that I need to use a combination of mutate() across() where() and group_by() to add data to a new column Q53_scale with "hi" "med" "low" based on where Q53_1 values fall in relation to those low/high thresholds (e.g. SD +1 over mean and -1 under mean). My first few dozen attempts have fallen short - has anyone managed to use sd() to bin data for faceting in this way?
library(tidyverse)
climate_experience_data <- structure(list(Q4 = c(2, 3, 3, 5, 4, 3), Q5 = c(
1, 3, 3, 3, 2,
2
), Q6 = c(4, 3, 3, 3, 4, 4), Q7 = c(4, 2, 3, 5, 5, 5), Q53_1 = c(
5,
8, 4, 5, 4, 5
)), row.names = c(NA, -6L), class = c(
"tbl_df",
"tbl", "data.frame"
))
climate_experience_data %>%
mutate(
bin = case_when(
Q53_1 > mean(Q53_1) + sd(Q53_1) ~ "high",
Q53_1 < mean(Q53_1) - sd(Q53_1) ~ "low",
TRUE ~ "medium"
) %>% factor(levels = c("low", "medium", "high"))
) %>%
ggplot(aes(Q4, Q5)) +
geom_point() +
facet_grid(~bin)
Created on 2022-03-10 by the reprex package (v2.0.0)
I have some data that comes from the measurement of an image where essentially the columns signify position (x) and height (z) data. The problem is that this data gets spit out as a .csv file in the wide format. I am trying to find a way to convert this to the long format but I'm unsure how to do this because I can't designate an identifier.
I know there are a lot of questions on reshaping data but I didn't find anything quite like this.
As an example:
df <- data.frame(V1 = c("Profile", "x", "[m]", 0, 2, 4, 6, 8, 10, 12, NA, NA),
V2 = c("1", "z", "[m]", 3, 3, 4, 10, 12, 9, 2, NA, NA),
V3 = c("Profile", "x", "[m]", 0, 2, 4, 6, NA, NA, NA, NA, NA),
V4 = c("2", "z", "[m]", 4, 8, 10, 10, NA, NA, NA, NA, NA),
V5 = c("Profile", "x", "[m]", 0, 2, 4, 6, 8, 10, 12, 14, 17),
V2 = c("3", "z", "[m]", 0, 1, 1, 10, 14, 11, 6, 2, 0))
Every two columns represents X,Z data (you can see grouped by Profile 1, Profile 2, Profile 3, etc). However, measurements are not equal lengths, hence the rows with NAs. Is there a programmatic way to reshape this data into the long form? i.e.:
profile x z
Profile 1 0 3
Profile 1 2 3
Profile 1 4 4
... ... ...
Profile 2 0 4
Profile 2 2 8
Profile 2 4 10
... ... ...
Thank you in advance for your help!
You can do the following (Its a bit verbose, feel free to optimize):
dfcols <- NCOL(df)
xColInds <- seq(1,dfcols,by=2)
zColInds <- seq(2,dfcols,by=2)
longdata <- do.call("rbind",lapply(1:length(xColInds), function(i) {
xValInd <- xColInds[i]
zValInd <- zColInds[i]
profileName <- paste0(df[1,xValInd]," ",df[1,zValInd])
xVals <- as.numeric(df[-(1:3),xValInd])
zVals <- as.numeric(df[-(1:3),zValInd])
data.frame(profile=rep(profileName,length(xVals)),
x = xVals,
z = zVals)
}))
If you want it more performant, dont cast to data.frame every single iteration. One cast at the end is enough, like:
xColInds <- seq(1,NCOL(df),by=2)
longdataList <- lapply(xColInds, function(xci) {
list(profileName = paste0(df[1,xci]," ",df[1,xci+1]),
x = df[-(1:3),xci],
z = df[-(1:3),xci+1])
})
longdata <- data.frame(profile = rep(unlist(lapply(longdataList,"[[","profileName")),each=NROW(df)-3),
x = as.numeric(unlist(lapply(longdataList,"[[","x"))),
z = as.numeric(unlist(lapply(longdataList,"[[","z"))))
I am attempting to run a clustering algorithm over a list of dissimilarity matrices for different numbers of clusters k and extract some information for each run.
This first block of code produces the list of dissimilarity matrices
library(tidyverse)
library(cluster)
library(rje)
dat=mtcars[,1:3]
v_names=names(dat)
combos=rje::powerSet(v_names)
combos=combos[lengths(combos)>1]
df_list=list()
for (i in seq_along(combos)){
df_list[[i]]=dat[combos[[i]]]
}
gower_ls=lapply(df_list,daisy,metric="gower")
Here is the section of code I am having a problem with
set.seed(4)
model_num <-c(NA)
sil_width <-c(NA)
min_sil<-c(NA)
mincluster<-c(NA)
k_clusters <-c(NA)
lowest_sil <-c(NA)
maxcluster <-c(NA)
model_vars <- c(NA)
clust_4=lapply(gower_ls,pam,diss=TRUE,k=4)
for(m in 1:length(clust_4)){
sil_width[m] <-clust_4[[m]][7]$silinfo$avg.width
min_sil[m] <- min(clust_4[[m]][7]$silinfo$clus.avg.widths)
mincluster[m] <-min(clust_4[[m]][6]$clusinfo[,1])
maxcluster[m] <-max(clust_4[[m]][6]$clusinfo[,1])
k_clusters[m]<- nrow(clust_4[[m]][6]$clusinfo)
lowest_sil[m]<-min(clust_4[[m]][7]$silinfo$widths)
model_num[m] <-m
}
colresults_4=as.data.frame(cbind( sil_width, min_sil,mincluster,maxcluster,k_clusters,model_num,lowest_sil))
How can I convert this piece of code to run for a given range of k? I've tried a nested loop but I was not able to code it correctly. Here are the desired results for k= 4:6, thanks.
structure(list(sil_width = c(0.766467312788453, 0.543226669407726,
0.765018469447229, 0.705326458357873, 0.698351173575526, 0.480565022092276,
0.753366365875066, 0.644345251543097, 0.699437672202048, 0.430310752506775,
0.678224885117295, 0.576411380463116), min_sil = c(0.539324315243191,
0.508330909368204, 0.637090842537915, 0.622120627356455, 0.539324315243191,
0.334047777245833, 0.430814518122641, 0.568591550281139, 0.539324315243191,
0.295113900268025, 0.430814518122641, 0.19040716086259), mincluster = c(5,
3, 4, 5, 2, 3, 3, 3, 2, 3, 3, 3), maxcluster = c(14, 12, 11,
14, 12, 10, 11, 11, 9, 6, 7, 7), k_clusters = c(4, 4, 4, 4, 5,
5, 5, 5, 6, 6, 6, 6), model_num = c(1, 2, 3, 4, 1, 2, 3, 4, 1,
2, 3, 4), lowest_sil = c(-0.0726256983240229, 0.0367238314801671,
0.308069836672298, 0.294247157041013, -0.0726256983240229, -0.122804288130541,
-0.317748917748917, 0.218164082936686, -0.0726256983240229, -0.224849074123824,
-0.317748917748917, -0.459909237820881)), row.names = c(NA, -12L
), class = "data.frame")
I was able to come up with a solution by writing a function clus_func that extracts the cluster information and then using cross2 and map2 from the purrr package:
library(tidyverse)
library(cluster)
library(rje)
dat=mtcars[,1:3]
v_names=names(dat)
combos=rje::powerSet(v_names)
combos=combos[lengths(combos)>1]
clus_func=function(x,k){
clust=pam(x,k,diss=TRUE)
clust_stats=as.data.frame(cbind(
avg_sil_width=clust$silinfo$avg.width,
min_clus_width=min(clust$silinfo$clus.avg.widths),
min_individual_sil=min(clust$silinfo$widths[,3]),
max_individual_sil=max(clust$silinfo$widths[,3]),
mincluster= min(clust$clusinfo[,1]),
maxcluster= max(clust$clusinfo[,1]),
num_k=max(clust$clustering) ))
}
df_list=list()
for (i in seq_along(combos)){
df_list[[i]]=dat[combos[[i]]]
}
gower_ls=lapply(df_list,daisy,metric="gower")
begin_k=4
end_k=6
cross_list=cross2(gower_ls,begin_k:end_k)
k=c(NA)
for(i in 1:length(cross_list)){ k[i]=cross_list[[i]][2]}
diss=c(NA)
for(i in 1:length(cross_list)){ diss[i]=cross_list[[i]][1]}
model_stats=map2(diss, k, clus_func)
model_stats=rbindlist(model_stats)
I am trying to reproduce some of the examples given by the ROI creators.
For example in http://statmath.wu.ac.at/courses/optimization/Presentations/ROI-2011.pdf (slides 15-17) there is the example:
library("ROI")
#ROI: R Optimization Infrastructure
#Installed solver plugins: cplex, lpsolve, glpk, quadprog, symphony, nlminb.
#Default solver: glpk.
(constr1 <- L_constraint(c(1, 2), "<", 4))
#An object containing 1 linear constraints.
(constr2 <- L_constraint(matrix(c(1:4), ncol = 2), c("<", "<"), c(4, 5)))
#An object containing 2 linear constraints.
rbind(constr1, constr2)
#An object containing 3 linear constraints.
(constr3 <- Q_constraint(matrix(rep(2, 4), ncol = 2), c(1, 2), "<", 5))
#An object containing 1 constraints.
#Some constraints are of type quadratic.
foo <- function(x) {sum(x^3) - seq_along(x) %*% x}
F_constraint(foo, "<", 5)
lp <- LP(objective = c(2, 4, 3), L_constraint(L = matrix(c(3, 2, 1, 4, 1, 3, 2, 2, 2), nrow = 3), dir = c("<=", "<=", "<="), rhs = c(60, 40, 80)), maximum = TRUE)
qp <- QP(Q_objective(Q = diag(1, 3), L = c(0, -5, 0)), L_constraint(L = matrix(c(-4, -3, 0, 2, 1, 0, 0, -2, 1), ncol = 3, byrow = TRUE), dir = rep(">=", 3), rhs = c(-8, 2, 0)))
When I run it I get the errors
Error in LP(objective = c(2, 4, 3), L_constraint(L = matrix(c(3, 2, 1, :
could not find function "LP"
and
Error in QP(Q_objective(Q = diag(1, 3), L = c(0, -5, 0)), L_constraint(L = matrix(c(-4, :
could not find function "QP"
In fact the functions are not in ROI's namespace. e.g.
ROI::LP
Error: 'LP' is not an exported object from 'namespace:ROI'
The same syntax appears in other examples I found on the web but the functions LP and QP are never defined.
I am using ROI 0.3.0
Can someone tell me what is going wrong?
The commands LP and QP were both changed to OP.
library("ROI")
## ROI: R Optimization Infrastructure
## Registered solver plugins: nlminb, alabama, cbc, cccp, clp, deoptim, ecos, glpk, ipop, lpsolve, msbinlp, neos, nloptr, ucminf, spg, cgm, vmm, bobyqa, newuoa, uobyqa, hjk, nmk, lbfgs, optimx, qpoases, quadprog, scs, symphony.
## Default solver: auto.
(constr1 <- L_constraint(c(1, 2), "<", 4))
## An object containing 1 linear constraint.
(constr2 <- L_constraint(matrix(c(1:4), ncol = 2), c("<", "<"), c(4, 5)))
## An object containing 2 linear constraints.
rbind(constr1, constr2)
## An object containing 3 linear constraints.
(constr3 <- Q_constraint(matrix(rep(2, 4), ncol = 2), c(1, 2), "<", 5))
## An object containing 0 linear constraints
## 1 quadratic constraint.
foo <- function(x) {sum(x^3) - seq_along(x) %*% x}
F_constraint(foo, "<", 5)
## An object containing 1 nonlinear constraint.
lp <- OP(objective = c(2, 4, 3),
L_constraint(L = matrix(c(3, 2, 1, 4, 1, 3, 2, 2, 2), nrow = 3),
dir = c("<=", "<=", "<="),
rhs = c(60, 40, 80)), maximum = TRUE)
qp <- OP(Q_objective(Q = diag(1, 3), L = c(0, -5, 0)),
L_constraint(L = matrix(c(-4, -3, 0, 2, 1, 0, 0, -2, 1), ncol = 3, byrow = TRUE),
dir = rep(">=", 3), rhs = c(-8, 2, 0)))
The slides you refer to are outdated. The new documentation is on http://roi.r-forge.r-project.org !
I have some data, much of which is NA.
For simplicity, let's say it looks like this:
x = c(NA, 3, 4, 3.5, NA, NA, NA, NA, 7, 5)
bins = c(1, 1, 1, 2, 2, 2, 3, 3, 4, 4)
I'm using ave( ) and which.min( ) to get the minimum value for each bin type:
ave(x, segments, FUN = which.min)
But I get the error:
Error in `split<-.default`(`*tmp*`, g, value = lapply(split(x, g), FUN)) :
replacement has length zero
The reason this is happening (I think) is because bin # 3 only has NA values. When this is rectified, the error disappears. I could just use a function like:
ave(x, segments, FUN = function(xx){
if(all(is.na(xx))){
return(NA)
} else {
xx = which.min(xx)
return(xx)
}}
)
But:
1) that is hacky as heck. And
2) which.min(c(NA, NA, NA)) does not cause an error, nor does ave(c(NA, NA, NA), c(1, 1, 1), FUN=mean) - so what's going on that I'm missing?
--> Anybody have an idea of why this error happens / the best way to get around it?
Cheers.