Error in model.frame.default(formula ...) - r

I'm trying to make an R script to allow users to input a dataset and then display the predictive rate parity graph for their corresponding dataset. I have most of the code but when I attempt to test it with a dataset, I receive an error.
The code is below:
library(mltools)
library(fairness)
library(dplyr)
library(data.table)
calculate_fairness_metric <- function(newdata, target, sensitive_attr, base) {
set.seed(77)
val_percent <- 0.2
val_idx <- sample(1:nrow(new_data))[1:round(nrow(new_data) * val_percent)]
df_train <- new_data[-val_idx, ]
df_valid <- new_data[ val_idx, ]
model1 <- glm(target ~ .,
data = df_train,
family = binomial(link = 'logit'))
df_valid$prob_1 <- predict(model1, df_valid, type = 'response')
res1 <- pred_rate_parity(data = df_valid,
outcome = target,
outcome_base = '0',
group = sensitive_attr,
probs = 'prob_1',
cutoff = 0.5,
base = base)
return(res1$Metric)
}
calculate_fairness_metric(revised, "readmitted", "race", "Caucasian")
Error in model.frame.default(formula = target ~ ., data = df_train, drop.unused.levels = TRUE) :
variable lengths differ (found for 'race')
The dataset I used is below:
dataset image

Related

tbl_regression(), plm, and mice - Error: Tibble columns must have compatible sizes

I am trying to print a regression model using tbl_regression() on a plm object with multiply imputed data. I've found that I can print the regression table if the plm has one independent variable, but not if it has two or more independent variables.
I understand that the below error message is common, but I don't understand what it means in the context of tbl_regression and multiply imputed data. Is there a bug in gtsummary, or is something wrong with my code?
library(mice, warn.conflicts = FALSE)
library(mitools)
library(missMethods)
library(plm)
library(gtsummary)
options(scipen=999)
set.seed(12345)
data("Grunfeld")
df <- delete_MCAR(Grunfeld, p = 0.3, cols_mis = c(3:5))
imp <- mice::mice(df, m = 5, print = FALSE)
implist <- imputationList(
lapply(1:imp$m, function(n) mice::complete(imp, action = n)))
fit1 <- lapply(implist$imputations, function(x){ plm(inv ~ value, data = x, model = "within", index = c("firm", "year"))})
#tbl_regression(as.mira(fit1)) # works
fit2 <- lapply(implist$imputations, function(x){ plm(inv ~ value + capital, data = x, model = "within", index = c("firm", "year"))})
tbl_regression(as.mira(fit2)) # does not work
#> pool_and_tidy_mice(): Tidying mice model with
#> `mice::pool(x) %>% mice::tidy(exponentiate = FALSE, conf.int = TRUE, conf.level = 0.95)`
#> Error: Tibble columns must have compatible sizes.
#> * Size 2: Existing data.
#> * Size 3: Column `variable`.
#> ℹ Only values of size one are recycled.
Thank you! This code now works under the current version of gtsummary.
library(mice, warn.conflicts = FALSE)
library(mitools)
library(missMethods)
library(plm)
library(gtsummary)
options(scipen=999)
set.seed(12345)
mice::version(pkg = "gtsummary")
#> [1] "gtsummary 1.5.1.9001 /Library/Frameworks/R.framework/Versions/4.1/Resources/library"
data("Grunfeld")
df <- delete_MCAR(Grunfeld, p = 0.3, cols_mis = c(3:5))
imp <- mice::mice(df, m = 5, print = FALSE)
implist <- imputationList(
lapply(1:imp$m, function(n) mice::complete(imp, action = n)))
fit2 <- lapply(implist$imputations, function(x){ plm(inv ~ value + capital, data = x, model = "within", index = c("firm", "year"))})
tbl_regression(as.mira(fit2))
#> pool_and_tidy_mice(): Tidying mice model with
#> `mice::pool(x) %>% mice::tidy(exponentiate = FALSE, conf.int = TRUE, conf.level = 0.95)`

How can I find regression model analyses from 2 dataset?

setwd("C:/Users/sevvalayse.yurtekin/Desktop/hw3")
data = read.table('DSE501_fall2020_HW3.csv', header= T, sep=',')
attach
data
getOption("max.print")
rs<-rowSums(data[,2:76], na.rm = TRUE)
data<-cbind(data,rs)
data
p1<-ggplot()+
geom_line(aes(y = rs, x=year), data=data)+
scale_x_continuous(breaks = seq(2004,2019,2))
p1
model = lm(rs ~ year )
model
summary(model)
residuals(model)
predict(model)
#model.fit = lm(year~rs)
#summary(model.fit)
new.year<-data.frame(
year = c(2021,2022,2023)
)
predict(model, newdata = new.year, interval = 'confidence')
data2 = read.table('TUIK_nufus_2019.csv', header = T, sep=",")
data2
total = data2$Total
mydata<-data[-c(1,2,3),]
model2 = lm(mydata~total)
model2
Hello, I have an error about the Error in model.frame.default(formula = mydata ~ total, drop.unused.levels = TRUE) : invalid type (list) for variable 'mydata'.
How can I fixed? I want to regression analyses from 2 data.
The line that's causing the issue is model2 = lm(mydata~total). mydata is not a vector, which is what your dependent variable should be in the lm function. When you set mydata you do not provide a column name: mydata<-data[-c(1,2,3), <enter column name of dependent variable>]
Otherwise you can fit your model with the following syntax (provided your dependent and independent variables are in the same dataframe). Here I just used y as a fake variable name: lm(y ~ total, data = mydata)

Select tuneGrid depending on the model in caret R

I try to apply ML on the iris dataset, using "knn" and "rpart" algorithms. This is my code:
library(tidyverse)
library(caret)
dataset <- iris
tt_index <- createDataPartition(dataset$Sepal.Length, times = 1, p = 0.9, list = FALSE)
train_set <- dataset[tt_index, ]
test_set <- dataset[-tt_index, ]
models <- c("knn","rpart")
fits <- lapply(models, function(model){
print(model)
train(Species ~ .,
data = train_set,
tuneGrid = case_when(model == "knn" ~ data.frame(k = seq(3,50,1)),
model == "rpart" ~ data.frame(cp = seq(0,0.1,len = 50))),
method = model)
})
I want to set tuneGrid parameter depending on the model inside lapply. But I receive this error:
Error in `[.data.frame`(value[[1]], rep(NA_integer_, m)) :
undefined columns selected
Any help will be greatly appreciated.
We could use if/else
library(caret)
out <- lapply(models, function(model)
train(Species ~ ., data = train_set,
tuneGrid = if(model == "knn") data.frame(k = seq(3,50,1)) else
data.frame(cp = seq(0,0.1,len = 50)), method = model))
According to ?case_when
A vector of length 1 or n, matching the length of the logical input or output vectors, with the type (and attributes) of the first RHS. Inconsistent lengths or types will generate an error.

Predict(), NewData with two column and differing rows

I am trying to make the prediction of three variables (retweets,media,content) in my dataset (df_22) to choose between Poisson, Negative binomial and Zero-inflated Poisson. One of the three variables is the response variable (retweets) and the other two the predictive variables (media,content).
I realize the generalized linear models and without problem.
Zero-inflated Poisson data
library("pscl")
summary( m0 <- zeroinfl(retweets ~ media + content, data=df_22,dist="poisson") )
Poisson
summary( m1 <- glm(formula=retweets ~ media + content, data=df_22, family="poisson"(link=log)))
Negative binomial
library (MASS)
summary( m2 <- glm.nb(retweets ~ media + content, data=df_22) )
However, when I create the new database to make the prediction. I check it levels.
> levels(df_22$media)
[1] "other" "pic" "pw" "text" "web"
> levels(df_22$content)
[1] "cultura" "employ" "environment" "other" "security" "sport" "transport"
I have a problem. And it is that the rows of both columns is different.
newmedia = c("other","pic","pw","text", "web")
newcontent = c("cultura","employ","environment","other","security","sport","transport")
nd = data.frame(media = newmedia, content = newcontent)
Error in data.frame(media = newmedia, content = newcontent) : arguments imply differing number of rows: 5, 7
What should I do to solve these problems?
I want to solve this problem in order to be able to make these predictions so that I can choose which of the three models is better for my data.
p0 <- cbind(nd, Count = predict(m0, newdata = nd, type = "count"), Zero = predict(m0, newdata = nd, type = "zero"))
p1 <- cbind(nd, Mean = predict(m1, newdata = nd, type="response"), SE = predict(m1, newdata = nd, type="response", se.fit=T)$se.fit)
p2 <- cbind(nd, Mean = predict(m2, newdata = nd, type="response"), SE = predict(m2, newdata = nd, type="response", se.fit=T)$se.fit)
In the code below a sample data set is created and it computes the p0, p1, p2. The nb dataframe was created differently as a test dataframe.
Import libraries
library(pscl)
library (MASS)
Create sample data set
media <- c("other", "pic", "pw", "text", "web")
content <- c("cultura", "employ", "environment", "other", "security", "sport", "transport")
set.seed(1)
retweets <- floor(abs(1e4*rnorm(1000)))
temp_index <- which(retweets %in% sample(retweets, 20)) # sample indexes
retweets[temp_index] <- 0 # set some retweets to zero to run zeroinfl()
df <- data.frame(retweets)
df$media <- sample(media, 1000, replace = TRUE)
df$content <- sample(content, 1000, replace = TRUE)
head(df)
unique(df$media)
unique(df$content)
Create a test data set
Note: Here, test data set is drawn from the training data for illustration purpose only. Ideally, it should be a new set of data.
nd = df[sample(nrow(df), 300), ] # ideally this should not be from the train data, this is just for an example code
nd_X <- test[,c('media', 'content')]
nd_Y <- test[,c('retweets')]
Fit models: zeroinf(dist='poisson'), glm(family='poisson'), glm.nb()
# Poisson
summary( m0 <- zeroinfl(retweets ~ media + content, data=df, dist="poisson") )
# Binomial
summary( m1 <- glm(formula=retweets ~ media + content, data=df, family="poisson"(link=log)))
# glm()
#summary( m2 <- glm.nb(retweets ~ media + content, data=df) ) # gives error in summary due to zeros
summary( m2 <- glm.nb(retweets ~ media + content, data=df[df$retweets!=0,]) ) # no error without zeros
Predict using test data set
p0 <- cbind(nd, Count = predict(m0, newdata = nd_X, type = "count"), Zero = predict(m0, newdata = nd, type = "zero"))
p1 <- cbind(nd, Mean = predict(m1, newdata = nd_X, type="response"), SE = predict(m1, newdata = nd, type="response", se.fit=T)$se.fit)
p2 <- cbind(nd, Mean = predict(m2, newdata = nd_X, type="response"), SE = predict(m2, newdata = nd, type="response", se.fit=T)$se.fit)
Output:

Variable Lengths Differ Error

Throughout my function I have arguments z and y
I want z to be equal to a data set (for example birthwt) and y to be equal to a response variable (for example birthwt$low)
library("MASS")
library("dplyr")
data(birthwt)
foo=function(z,y){
n.folds <- 10
folds <- cut(sample(seq_len(nrow(z))), breaks=n.folds, labels=FALSE)
all.confusion.tables <- list()
for (i in seq_len(n.folds)) {
train <- filter(z, folds != i)
test <- filter(z, folds == i)
glm.train <- glm(y ~.,family = binomial, data = train)
mod_pred_probs =predict(glm.train,test, type= "response")
pred.class <- ifelse(mod_pred_probs< 0, 0, 1)
all.confusion.tables[[i]] <- table(pred = pred.class, true = test$y)
}
misclassrisk <- function(x) { (sum(x) - sum(diag(x)))/sum(x) }
risk <- sapply(all.confusion.tables, misclassrisk)
return(table(risk))
mean(risk)}
When I run foo(birtht,"low")
I get the error:
Error in model.frame.default(formula = y ~ ., data = train, drop.unused.levels = TRUE) :
variable lengths differ (found for 'low')
Does any one know why I am getting the error or how I can avoid it?

Resources