Related
I needed to generate array or many data frames from other data frames which only varied in names. This required me to do a lot of copy-paste works. Is it possible that I can make it cleaner but not keep copying and pasting? Follows are two examples from many similar cases of the analysis I am doing now (I will provide codes for reproduction at the end of the question), which I think may be able to make them cleaner with the same approach.
case 1, create an array with data from per_d1,per_d1,per_d3,per_d4,per_d5
perd <- array(dim=c(7,15,5))
perd [,,1] <- as.matrix(per_d$per_d1)
perd [,,2] <- as.matrix(per_d$per_d2)
perd [,,3] <- as.matrix(per_d$per_d3)
perd [,,4] <- as.matrix(per_d$per_d4)
perd [,,5] <- as.matrix(per_d$per_d5)
case 2, create multiple data frames from data with similar names.
dataplot <- dfmak (per_d$per_d1,ge$per_d1$g1,ge$per_d1$g2,ge$per_d1$g3,ge$per_d1$g4,ge$per_d1$g5)
dataplot2 <- dfmak (per_d$per_d2,ge$per_d2$g1,ge$per_d2$g2,ge$per_d2$g3,ge$per_d2$g4,ge$per_d2$g5)
dataplot3 <- dfmak (per_d$per_d3,ge$per_d3$g1,ge$per_d3$g2,ge$per_d3$g3,ge$per_d3$g4,ge$per_d3$g5)
dataplot4 <- dfmak (per_d$per_d4,ge$per_d4$g1,ge$per_d4$g2,ge$per_d4$g3,ge$per_d4$g4,ge$per_d4$g5)
dataplot5 <- dfmak (per_d$per_d5,ge$per_d5$g1,ge$per_d5$g2,ge$per_d5$g3,ge$per_d5$g4,ge$per_d5$g5)
codes for reproduction
N <- 1
CS <- 10.141
S <- seq (7.72,13,0.807)
t <- 15
l <- length (S)
m0 <- 100
exps <- c(0.2, 0.5, 0.9, 1.5, 2)
sd <- c(0.2, 0.5, 0.8, 1.3, 1.8)
names(sd) <- paste("per", seq_along(sd), sep = "")
per <- lapply(sd, function(x){
per <- matrix(nrow = length(S)*N, ncol = t+1)
for (i in 1:dim(per)[1]) {
for (j in 1:t+1){
per [,1] <- replicate (n = N, S)
per [i,j] <- round (abs (rnorm (1, mean = per[i,1], sd =x)),digits=3)
colnames(per) <- c('physical',paste('t', 1:15, sep = ""))
per <- as.data.frame (per)
}
}
per <- per [,-1]
return(per)
}
)
per_d <- lapply(per, function(x){
per_d <- abs (x - 10.141)
}
)
names(per_d) <- paste("per_d", seq_along(sd), sep = "")
gefun <- function (i){
res <- lapply(exps, function(x){
g <- as.matrix (m0 * exp (-x * i))
for (i in 1:l) {
for (j in 1:t){
g [i,j] <- abs((round (rnorm(1,mean = g[i,j],sd=3), digits = 3)))
colnames(g) <- paste('t', 1:ncol(g), sep = "")
g <- as.data.frame(g)
}}
return(g)
}
)
}
ge <- lapply(per_d, gefun)
for (i in 1:length(ge)){
names(ge[[i]]) <- paste("g", seq_along(ge), sep = "")
}
dfmak <- function(df1,df2,df3,df4,df5,df6){
data.frame(stimulus = c (paste0('S',1:3),'CS+',paste0('S',5:7)),
phy_dis = S,
per_dis = c(df1$t1,df1$t2,df1$t3,df1$t4,df1$t5,df1$t6,df1$t7,df1$t8,df1$t9,df1$t10,df1$t11,df1$t12,df1$t13,df1$t14,df1$t15),
trials = rep(1:15, each = 7),
response_0.2 = c (df2$t1,df2$t2,df2$t3,df2$t4,df2$t5,df2$t6,df2$t7,df2$t8,df2$t9,df2$t10,df2$t11,df2$t12,df2$t13,df2$t14,df2$t15),
response_0.5 = c (df3$t1,df3$t2,df3$t3,df3$t4,df3$t5,df3$t6,df3$t7,df3$t8,df3$t9,df3$t10,df3$t11,df3$t12,df3$t13,df3$t14,df3$t15),
response_0.9 = c (df4$t1,df4$t2,df4$t3,df4$t4,df4$t5,df4$t6,df4$t7,df4$t8,df4$t9,df4$t10,df4$t11,df4$t12,df4$t13,df4$t14,df4$t15),
response_1.5 = c (df5$t1,df5$t2,df5$t3,df5$t4,df5$t5,df5$t6,df5$t7,df5$t8,df5$t9,df5$t10,df5$t11,df5$t12,df5$t13,df5$t14,df5$t15),
response_2 = c (df6$t1,df6$t2,df6$t3,df6$t4,df6$t5,df6$t6,df6$t7,df6$t8,df6$t9,df6$t10,df6$t11,df6$t12,df6$t13,df6$t14,df6$t15)
)
}
You can try the followings. But the codes, unfortunately, are not short.
Case 1
a <- lapply(per_d, as.matrix)
b <- c(a, recursive = TRUE)
pred <- array(b, dim = c(7,15,5))
Case 2
The data frames will be stored in a list. You still have to extract them using $ or [[]].
# create empty lists to store the outputs
out <- list()
name <- list()
for(i in 1:5) {
a <- per_d[[i]]
b <- ge[[i]][[1]]
c <- ge[[i]][[2]]
d <- ge[[i]][[3]]
e <- ge[[i]][[4]]
f <- ge[[i]][[5]]
arg <- list(a, b, c, d, e, f)
name[[i]] <- paste0("df_", i)
out[[i]] <- do.call(dfmak, arg)
}
out <- setNames(out, name)
I am getting the following error :
Error in x[i, j] : incorrect number of dimensions
while executing the following code :
library(GA)
library(readxl)
path <- "GAMS data & solution.xlsx"
c <- read_excel(path,range = "C3:G7",col_names = F)
f <- read_excel(path,range = "C10:G10",col_names = F)
d <- read_excel(path,range = "C13:G13",col_names = F)
cap <- read_excel(path,range = "C16:G16",col_names = F)
rows <- nrow(c)
cols <- ncol(c)[enter image description here][1]
val2 <-0
val1 <-0
fitness <- function(m){
x<-m[1]
y<-m[2]
# define fitness function
for(i in 1:rows){
for(j in 1:cols){
val <- c[i,j]*x[i,j]
val1 <- val1 + val
}
}
for(i in 1:rows){
val0 <- f[i]*y[i]
val2 <- val2+val0
}
fitness_value <- val1 + val2
#define constraint
g1 <- x
for(j in 1:cols){
for(i in 1:rows){
sum1 <- x[i,j]
sum2 <- sum2+sum1
}
gtemp <- sum2-d[j]
g2 <- append(g2,gtemp)
}
for(i in 1:rows){
for(j in 1:cols){
sum0 <- x[i,j]
sum3 <- sum3+sum0
}
gtemp1 <- sum3-cap[i]*y[i]
g3 <- append(g3,gtemp)
}
#penalized constraint violation
fitness_value <- ifelse( g1 >= 0 & g2 >= 0 & g3 <= 0 , fitness_value, fitness_value + 1e5 )
return(-fitness_value)
}
ga(type = "real-valued", fitness,lower = c(0,0),upper = c(10000, 1),maxiter = 1000, popSize = 100, monitor = F)
I am trying to solve the following problem using GA package of R:
Here is what my data looks like.
I'm trying to iterate over a list (within a list) and I am having some trouble, any help would be appreciated.
boston <- data.frame(Boston)
# Subset 1: zn - 2, chas-4, rm-6, dis-8, black-12,
# Subset 2: crim-1, indus-3, nox-5, age-7, tax-10, ptratio-11
# Subset 3: all
kvals <- c(1,3,5)
subset1 <- c("zn", "chas", "rm", "dis", "black")
subset2 <- c("crim", "indus", "nox", "age", "tax", "ptratio")
subset3 <- c(boston[,1:13])
x1.train <- boston[, c(subset1)]
x2.train <- boston[, c(subset2)]
x3.train <- boston[, 1:13]
y.train <- boston$medv01
xtrain.list <- list(x1.train, x2.train, x3.train)
for (j in kvals ){
message("~~~~ K = ", j, " ~~~~")
for (s in xtrain.list ){
knn.cv.pred <- knn.cv(xtrain.list[[s]],
y.train,
k = kvals[j])
message("Subset ", s, " K = ", j, " Error: ", mean(knn.cv.pred != y.train)*100, "%")
}
message("\n")
}
but i get this error
+ }
Error in xtrain.list[[s]] : invalid subscript type 'list'
***Updated with better reprex to call out appropriate libraries
In R you, one way to write a for loop is to use seq_along function inside the for loop.
boston <- data.frame(MASS::Boston)
# Subset 1: zn - 2, chas-4, rm-6, dis-8, black-12,
# Subset 2: crim-1, indus-3, nox-5, age-7, tax-10, ptratio-11
# Subset 3: all
kvals <- c(1,3,5)
subset1 <- c("zn", "chas", "rm", "dis", "black")
subset2 <- c("crim", "indus", "nox", "age", "tax", "ptratio")
subset3 <- c(boston[,1:13])
x1.train <- boston[, c(subset1)]
x2.train <- boston[, c(subset2)]
x3.train <- boston[, 1:13]
y.train <- boston$medv
xtrain.list <- list(x1.train, x2.train, x3.train)
for (k in seq_along(kvals) ){
message("~~~~ K = ", k, " ~~~~")
for (s in seq_along(xtrain.list) ){
knn.cv.pred <- class::knn.cv(xtrain.list[[s]],
y.train,
k = kvals[[k]])
message("Subset ", s, " K = ", k, " Error: ", mean(knn.cv.pred != y.train)*100, "%")
}
message("\n")
}
I need to plot the BIC value from each regression step in the step function using ggplot. I have no idea how to use ggplot to plot each steps BIC value.
form_model <- formula(lm(price~sqft_living+sqft_lot+waterfront+sqft_above+sqft_basement+years_since_renovations+age_of_house+grade_int+bed_int+bath_int+floors_dummy+view_dummy+condition_dummy+basement_dummy+renovated_dummy+weekend_dummy))
mod <- lm(price~1)
n <- (nrow(House_Regr))
forwardBIC <- step(mod,form_model,direction = "forward", k=log(n) )
Here is the model that i am using.
Start: AIC=181611.1
price ~ 1
Df Sum of Sq RSS AIC
+ sqft_living 1 5.5908e+16 6.9104e+16 178111
+ grade_int 1 4.2600e+16 8.2413e+16 179154
+ sqft_above 1 3.8988e+16 8.6024e+16 179407
+ view_dummy 1 1.5755e+16 1.0926e+17 180822
+ sqft_basement 1 1.1560e+16 1.1345e+17 181045
+ bed_int 1 1.0586e+16 1.1443e+17 181096
+ floors_dummy 1 8.6756e+15 1.1634e+17 181194
+ waterfront 1 8.1097e+15 1.1690e+17 181223
+ basement_dummy 1 3.8336e+15 1.2118e+17 181435
+ bath_int 1 2.1104e+15 1.2290e+17 181519
+ renovated_dummy 1 1.3665e+15 1.2365e+17 181555
+ years_since_renovations 1 8.6785e+14 1.2414e+17 181579
+ sqft_lot 1 8.2901e+14 1.2418e+17 181580
+ condition_dummy 1 6.4654e+14 1.2437e+17 181589
<none> 1.2501e+17 181611
+ age_of_house 1 1.7600e+14 1.2484e+17 181611
+ weekend_dummy 1 9.3267e+11 1.2501e+17 181620
Step: AIC=178111
price ~ sqft_living
Df Sum of Sq RSS AIC
+ view_dummy 1 4.7046e+15 6.4399e+16 177702
+ age_of_house 1 4.5059e+15 6.4598e+16 177721
+ waterfront 1 4.3957e+15 6.4708e+16 177731
+ grade_int 1 3.1890e+15 6.5915e+16 177840
+ years_since_renovations 1 3.0576e+15 6.6046e+16 177852
+ bed_int 1 1.7778e+15 6.7326e+16 177965
+ bath_int 1 1.7527e+15 6.7351e+16 177968
+ renovated_dummy 1 7.2312e+14 6.8381e+16 178057
+ basement_dummy 1 3.1144e+14 6.8793e+16 178093
+ sqft_above 1 1.6922e+14 6.8935e+16 178105
+ sqft_basement 1 1.6922e+14 6.8935e+16 178105
+ sqft_lot 1 1.2746e+14 6.8977e+16 178109
<none> 6.9104e+16 178111
+ condition_dummy 1 3.6244e+13 6.9068e+16 178117
+ floors_dummy 1 1.0259e+13 6.9094e+16 178119
+ weekend_dummy 1 5.9534e+12 6.9098e+16 178119
Here is a small output from the regression. I need to plot each steps BIC value using ggplot. My idea would be to just extract the BIC value for each step then plot them using ggplot but as i have said i have no idea how to accomplish this or if extracting the BIC is even necessary for ggplot.
How would i go about plotting the BIC for each step in the regression on ggplot?
I wouldn't recommend doing this usually, so if there is an answer using real functions then go for it. There is a function called in this: extractAIC that is storing the results, and then printing those tables. You can get the step function by typing it in the console. Quick scan showed me that in the variable aod inside this function it is storing the tables that it prints for each iteration.
A hacky way is to make a list inside this function, update the list with the table each time it changes and then either add it to the response (the usual way) or assign it out to the global environment (bad way). As I don't know anything about the class of the response of the step function, I've opted for the bad way. The full function is here. You can search for the # (!) addition flag to see where I've added it in.
The AIC column contains the BIC values. You can see it changes when you change the k value in the step call
Hope this works ok for you, I'm using the example in the step function
step2 <- function (object, scope, scale = 0, direction = c("both", "backward",
"forward"), trace = 1, keep = NULL, steps = 1000, k = 2,
...)
{
# (!) addition
aod.all <- list()
mydeviance <- function(x, ...) {
dev <- deviance(x)
if (!is.null(dev))
dev
else extractAIC(x, k = 0)[2L]
}
cut.string <- function(string) {
if (length(string) > 1L)
string[-1L] <- paste0("\n", string[-1L])
string
}
re.arrange <- function(keep) {
namr <- names(k1 <- keep[[1L]])
namc <- names(keep)
nc <- length(keep)
nr <- length(k1)
array(unlist(keep, recursive = FALSE), c(nr, nc), list(namr,
namc))
}
step.results <- function(models, fit, object, usingCp = FALSE) {
change <- sapply(models, "[[", "change")
rd <- sapply(models, "[[", "deviance")
dd <- c(NA, abs(diff(rd)))
rdf <- sapply(models, "[[", "df.resid")
ddf <- c(NA, diff(rdf))
AIC <- sapply(models, "[[", "AIC")
heading <- c("Stepwise Model Path \nAnalysis of Deviance Table",
"\nInitial Model:", deparse(formula(object)), "\nFinal Model:",
deparse(formula(fit)), "\n")
aod <- data.frame(Step = I(change), Df = ddf, Deviance = dd,
`Resid. Df` = rdf, `Resid. Dev` = rd, AIC = AIC,
check.names = FALSE)
if (usingCp) {
cn <- colnames(aod)
cn[cn == "AIC"] <- "Cp"
colnames(aod) <- cn
}
attr(aod, "heading") <- heading
fit$anova <- aod
fit
}
Terms <- terms(object)
object$call$formula <- object$formula <- Terms
md <- missing(direction)
direction <- match.arg(direction)
backward <- direction == "both" | direction == "backward"
forward <- direction == "both" | direction == "forward"
if (missing(scope)) {
fdrop <- numeric()
fadd <- attr(Terms, "factors")
if (md)
forward <- FALSE
}
else {
if (is.list(scope)) {
fdrop <- if (!is.null(fdrop <- scope$lower))
attr(terms(update.formula(object, fdrop)), "factors")
else numeric()
fadd <- if (!is.null(fadd <- scope$upper))
attr(terms(update.formula(object, fadd)), "factors")
}
else {
fadd <- if (!is.null(fadd <- scope))
attr(terms(update.formula(object, scope)), "factors")
fdrop <- numeric()
}
}
models <- vector("list", steps)
if (!is.null(keep))
keep.list <- vector("list", steps)
n <- nobs(object, use.fallback = TRUE)
fit <- object
bAIC <- extractAIC(fit, scale, k = k, ...)
edf <- bAIC[1L]
bAIC <- bAIC[2L]
if (is.na(bAIC))
stop("AIC is not defined for this model, so 'step' cannot proceed")
if (bAIC == -Inf)
stop("AIC is -infinity for this model, so 'step' cannot proceed")
nm <- 1
if (trace) {
cat("Start: AIC=", format(round(bAIC, 2)), "\n", cut.string(deparse(formula(fit))),
"\n\n", sep = "")
flush.console()
}
models[[nm]] <- list(deviance = mydeviance(fit), df.resid = n -
edf, change = "", AIC = bAIC)
if (!is.null(keep))
keep.list[[nm]] <- keep(fit, bAIC)
usingCp <- FALSE
while (steps > 0) {
steps <- steps - 1
AIC <- bAIC
ffac <- attr(Terms, "factors")
scope <- factor.scope(ffac, list(add = fadd, drop = fdrop))
aod <- NULL
change <- NULL
if (backward && length(scope$drop)) {
aod <- drop1(fit, scope$drop, scale = scale, trace = trace,
k = k, ...)
rn <- row.names(aod)
row.names(aod) <- c(rn[1L], paste("-", rn[-1L]))
if (any(aod$Df == 0, na.rm = TRUE)) {
zdf <- aod$Df == 0 & !is.na(aod$Df)
change <- rev(rownames(aod)[zdf])[1L]
}
}
if (is.null(change)) {
if (forward && length(scope$add)) {
aodf <- add1(fit, scope$add, scale = scale, trace = trace,
k = k, ...)
rn <- row.names(aodf)
row.names(aodf) <- c(rn[1L], paste("+", rn[-1L]))
aod <- if (is.null(aod))
aodf
else rbind(aod, aodf[-1, , drop = FALSE])
}
attr(aod, "heading") <- NULL
nzdf <- if (!is.null(aod$Df))
aod$Df != 0 | is.na(aod$Df)
aod <- aod[nzdf, ]
if (is.null(aod) || ncol(aod) == 0)
break
nc <- match(c("Cp", "AIC"), names(aod))
nc <- nc[!is.na(nc)][1L]
o <- order(aod[, nc])
# (!) addition
aod.all <- c(aod.all, list(aod))
if (trace)
print(aod[o, ])
if (o[1L] == 1)
break
change <- rownames(aod)[o[1L]]
}
usingCp <- match("Cp", names(aod), 0L) > 0L
fit <- update(fit, paste("~ .", change), evaluate = FALSE)
fit <- eval.parent(fit)
nnew <- nobs(fit, use.fallback = TRUE)
if (all(is.finite(c(n, nnew))) && nnew != n)
stop("number of rows in use has changed: remove missing values?")
Terms <- terms(fit)
bAIC <- extractAIC(fit, scale, k = k, ...)
edf <- bAIC[1L]
bAIC <- bAIC[2L]
if (trace) {
cat("\nStep: AIC=", format(round(bAIC, 2)), "\n",
cut.string(deparse(formula(fit))), "\n\n", sep = "")
flush.console()
}
if (bAIC >= AIC + 1e-07)
break
nm <- nm + 1
models[[nm]] <- list(deviance = mydeviance(fit), df.resid = n -
edf, change = change, AIC = bAIC)
if (!is.null(keep))
keep.list[[nm]] <- keep(fit, bAIC)
}
if (!is.null(keep))
fit$keep <- re.arrange(keep.list[seq(nm)])
# (!) addition
assign("aod.all", aod.all, envir = .GlobalEnv)
step.results(models = models[seq(nm)], fit, object, usingCp)
}
lm1 <- lm(Fertility ~ ., data = swiss)
slm1 <- step2(lm1)
aod.all
# Function to construct a mortality demogdata object from HMD
hmd.mx <- function(country, username, password, label=country){
path <- paste("https://www.mortality.org/hmd/", country, "/STATS/", "Mx_1x1.txt", sep = "")
}
userpwd <- paste(username, ":", password, sep = "")
txt <- RCurl::getURL(path, userpwd = userpwd)
con <- textConnection(txt)
mx <- try(utils::read.table(con, skip = 2, header = TRUE, na.strings = "."),TRUE)
close(con)
if(class(mx)=="try-error")
stop("Connection error at www.mortality.org. Please check username, password and country label.")
path <- paste("https://www.mortality.org/hmd/", country, "/STATS/", "Exposures_1x1.txt", sep = "")
userpwd <- paste(username, ":", password, sep = "")
txt <- RCurl::getURL(path, userpwd = userpwd)
con <- textConnection(txt)
pop <- try(utils::read.table(con, skip = 2, header = TRUE, na.strings = "."),TRUE)
close(con)
if(class(pop)=="try-error")
stop("Exposures file not found at www.mortality.org")
obj <- list(type="mortality",label=label,lambda=0)
obj$year <- sort(unique(mx[, 1]))
#obj$year <- ts(obj$year, start=min(obj$year))
n <- length(obj$year)
m <- length(unique(mx[, 2]))
obj$age <- mx[1:m, 2]
obj$rate <- obj$pop <- list()
for (i in 1:n.mort)
{ obj$rate[[i]] <- matrix(mx[, i + 2], nrow = m, ncol = n)
obj$rate[[i]][obj$rate[[i]] < 0] <- NA
obj$pop[[i]] <- matrix(pop[, i + 2], nrow = m, ncol = n)
obj$pop[[i]][obj$pop[[i]] < 0] <- NA
dimnames(obj$rate[[i]]) <- dimnames(obj$pop[[i]]) <- list(obj$age, obj$year)
}
names(obj$pop) = names(obj$rate) <- tolower(mnames)
obj$age <- as.numeric(as.character(obj$age))
if (is.na(obj$age[m])) {
obj$age[m] <- 2 * obj$age[m - 1] - obj$age[m - 2] }
return(structure(obj, class = "demogdata"))
}
Above is the code that we are using to import our population data into r.
NLdata <- hmd.mx(country = "NLD",username = "username",password="password")
This would be the specific code to obtain the Dutch data.
Would anyone happen to know how to add multiple countries into one, and put that data into one dataframe (same format as the demography data packages that we download)? So for example the mortality rates for the (Netherlands + France + Norway) / 3 into one package.
You can try this code. However I could not run your demography package. So you might need to edit the code a bit. Perhaps someone else can fill in the second part? I saw that no one has reacted yet.
C1 <- data.frame(Year = 1980:2018, value1 = rnorm(39), value2 = rnorm(39), Cat =rbinom(39,1,0.5), Country = "France")
C2 <- data.frame(Year = 1980:2018, value1 = rnorm(39), value2 = rnorm(39), Cat =rbinom(39,1,0.5),Country = "England")
C3 <- data.frame(Year = 1970:2018, value1 = rnorm(49), value2 = rnorm(49), Cat =rbinom(49,1,0.5),Country = "Netherlands")
C1 <- split(C1, C1$Cat)
C2 <- split(C2, C2$Cat)
C3 <- split(C3, C3$Cat)
list_all <- list(rbind(C1[[1]],C2[[1]],C3[[1]]),rbind(C1[[2]],C2[[2]],C3[[2]]))
Final_list <- lapply(list_all, function(x) x %>% group_by(Year) %>% summarise(Val1 = mean(value1), Val2 = mean(value2), Country = "All") %>% as.data.frame)