I'm trying to make an R script to allow users to input a dataset and then display the predictive rate parity graph for their corresponding dataset. I have most of the code but when I attempt to test it with a dataset, I receive an error.
The code is below:
library(mltools)
library(fairness)
library(dplyr)
library(data.table)
calculate_fairness_metric <- function(newdata, target, sensitive_attr, base) {
set.seed(77)
val_percent <- 0.2
val_idx <- sample(1:nrow(new_data))[1:round(nrow(new_data) * val_percent)]
df_train <- new_data[-val_idx, ]
df_valid <- new_data[ val_idx, ]
model1 <- glm(target ~ .,
data = df_train,
family = binomial(link = 'logit'))
df_valid$prob_1 <- predict(model1, df_valid, type = 'response')
res1 <- pred_rate_parity(data = df_valid,
outcome = target,
outcome_base = '0',
group = sensitive_attr,
probs = 'prob_1',
cutoff = 0.5,
base = base)
return(res1$Metric)
}
calculate_fairness_metric(revised, "readmitted", "race", "Caucasian")
Error in model.frame.default(formula = target ~ ., data = df_train, drop.unused.levels = TRUE) :
variable lengths differ (found for 'race')
The dataset I used is below:
dataset image
Related
I am trying to print a regression model using tbl_regression() on a plm object with multiply imputed data. I've found that I can print the regression table if the plm has one independent variable, but not if it has two or more independent variables.
I understand that the below error message is common, but I don't understand what it means in the context of tbl_regression and multiply imputed data. Is there a bug in gtsummary, or is something wrong with my code?
library(mice, warn.conflicts = FALSE)
library(mitools)
library(missMethods)
library(plm)
library(gtsummary)
options(scipen=999)
set.seed(12345)
data("Grunfeld")
df <- delete_MCAR(Grunfeld, p = 0.3, cols_mis = c(3:5))
imp <- mice::mice(df, m = 5, print = FALSE)
implist <- imputationList(
lapply(1:imp$m, function(n) mice::complete(imp, action = n)))
fit1 <- lapply(implist$imputations, function(x){ plm(inv ~ value, data = x, model = "within", index = c("firm", "year"))})
#tbl_regression(as.mira(fit1)) # works
fit2 <- lapply(implist$imputations, function(x){ plm(inv ~ value + capital, data = x, model = "within", index = c("firm", "year"))})
tbl_regression(as.mira(fit2)) # does not work
#> pool_and_tidy_mice(): Tidying mice model with
#> `mice::pool(x) %>% mice::tidy(exponentiate = FALSE, conf.int = TRUE, conf.level = 0.95)`
#> Error: Tibble columns must have compatible sizes.
#> * Size 2: Existing data.
#> * Size 3: Column `variable`.
#> ℹ Only values of size one are recycled.
Thank you! This code now works under the current version of gtsummary.
library(mice, warn.conflicts = FALSE)
library(mitools)
library(missMethods)
library(plm)
library(gtsummary)
options(scipen=999)
set.seed(12345)
mice::version(pkg = "gtsummary")
#> [1] "gtsummary 1.5.1.9001 /Library/Frameworks/R.framework/Versions/4.1/Resources/library"
data("Grunfeld")
df <- delete_MCAR(Grunfeld, p = 0.3, cols_mis = c(3:5))
imp <- mice::mice(df, m = 5, print = FALSE)
implist <- imputationList(
lapply(1:imp$m, function(n) mice::complete(imp, action = n)))
fit2 <- lapply(implist$imputations, function(x){ plm(inv ~ value + capital, data = x, model = "within", index = c("firm", "year"))})
tbl_regression(as.mira(fit2))
#> pool_and_tidy_mice(): Tidying mice model with
#> `mice::pool(x) %>% mice::tidy(exponentiate = FALSE, conf.int = TRUE, conf.level = 0.95)`
setwd("C:/Users/sevvalayse.yurtekin/Desktop/hw3")
data = read.table('DSE501_fall2020_HW3.csv', header= T, sep=',')
attach
data
getOption("max.print")
rs<-rowSums(data[,2:76], na.rm = TRUE)
data<-cbind(data,rs)
data
p1<-ggplot()+
geom_line(aes(y = rs, x=year), data=data)+
scale_x_continuous(breaks = seq(2004,2019,2))
p1
model = lm(rs ~ year )
model
summary(model)
residuals(model)
predict(model)
#model.fit = lm(year~rs)
#summary(model.fit)
new.year<-data.frame(
year = c(2021,2022,2023)
)
predict(model, newdata = new.year, interval = 'confidence')
data2 = read.table('TUIK_nufus_2019.csv', header = T, sep=",")
data2
total = data2$Total
mydata<-data[-c(1,2,3),]
model2 = lm(mydata~total)
model2
Hello, I have an error about the Error in model.frame.default(formula = mydata ~ total, drop.unused.levels = TRUE) : invalid type (list) for variable 'mydata'.
How can I fixed? I want to regression analyses from 2 data.
The line that's causing the issue is model2 = lm(mydata~total). mydata is not a vector, which is what your dependent variable should be in the lm function. When you set mydata you do not provide a column name: mydata<-data[-c(1,2,3), <enter column name of dependent variable>]
Otherwise you can fit your model with the following syntax (provided your dependent and independent variables are in the same dataframe). Here I just used y as a fake variable name: lm(y ~ total, data = mydata)
I try to apply ML on the iris dataset, using "knn" and "rpart" algorithms. This is my code:
library(tidyverse)
library(caret)
dataset <- iris
tt_index <- createDataPartition(dataset$Sepal.Length, times = 1, p = 0.9, list = FALSE)
train_set <- dataset[tt_index, ]
test_set <- dataset[-tt_index, ]
models <- c("knn","rpart")
fits <- lapply(models, function(model){
print(model)
train(Species ~ .,
data = train_set,
tuneGrid = case_when(model == "knn" ~ data.frame(k = seq(3,50,1)),
model == "rpart" ~ data.frame(cp = seq(0,0.1,len = 50))),
method = model)
})
I want to set tuneGrid parameter depending on the model inside lapply. But I receive this error:
Error in `[.data.frame`(value[[1]], rep(NA_integer_, m)) :
undefined columns selected
Any help will be greatly appreciated.
We could use if/else
library(caret)
out <- lapply(models, function(model)
train(Species ~ ., data = train_set,
tuneGrid = if(model == "knn") data.frame(k = seq(3,50,1)) else
data.frame(cp = seq(0,0.1,len = 50)), method = model))
According to ?case_when
A vector of length 1 or n, matching the length of the logical input or output vectors, with the type (and attributes) of the first RHS. Inconsistent lengths or types will generate an error.
I am trying to make the prediction of three variables (retweets,media,content) in my dataset (df_22) to choose between Poisson, Negative binomial and Zero-inflated Poisson. One of the three variables is the response variable (retweets) and the other two the predictive variables (media,content).
I realize the generalized linear models and without problem.
Zero-inflated Poisson data
library("pscl")
summary( m0 <- zeroinfl(retweets ~ media + content, data=df_22,dist="poisson") )
Poisson
summary( m1 <- glm(formula=retweets ~ media + content, data=df_22, family="poisson"(link=log)))
Negative binomial
library (MASS)
summary( m2 <- glm.nb(retweets ~ media + content, data=df_22) )
However, when I create the new database to make the prediction. I check it levels.
> levels(df_22$media)
[1] "other" "pic" "pw" "text" "web"
> levels(df_22$content)
[1] "cultura" "employ" "environment" "other" "security" "sport" "transport"
I have a problem. And it is that the rows of both columns is different.
newmedia = c("other","pic","pw","text", "web")
newcontent = c("cultura","employ","environment","other","security","sport","transport")
nd = data.frame(media = newmedia, content = newcontent)
Error in data.frame(media = newmedia, content = newcontent) : arguments imply differing number of rows: 5, 7
What should I do to solve these problems?
I want to solve this problem in order to be able to make these predictions so that I can choose which of the three models is better for my data.
p0 <- cbind(nd, Count = predict(m0, newdata = nd, type = "count"), Zero = predict(m0, newdata = nd, type = "zero"))
p1 <- cbind(nd, Mean = predict(m1, newdata = nd, type="response"), SE = predict(m1, newdata = nd, type="response", se.fit=T)$se.fit)
p2 <- cbind(nd, Mean = predict(m2, newdata = nd, type="response"), SE = predict(m2, newdata = nd, type="response", se.fit=T)$se.fit)
In the code below a sample data set is created and it computes the p0, p1, p2. The nb dataframe was created differently as a test dataframe.
Import libraries
library(pscl)
library (MASS)
Create sample data set
media <- c("other", "pic", "pw", "text", "web")
content <- c("cultura", "employ", "environment", "other", "security", "sport", "transport")
set.seed(1)
retweets <- floor(abs(1e4*rnorm(1000)))
temp_index <- which(retweets %in% sample(retweets, 20)) # sample indexes
retweets[temp_index] <- 0 # set some retweets to zero to run zeroinfl()
df <- data.frame(retweets)
df$media <- sample(media, 1000, replace = TRUE)
df$content <- sample(content, 1000, replace = TRUE)
head(df)
unique(df$media)
unique(df$content)
Create a test data set
Note: Here, test data set is drawn from the training data for illustration purpose only. Ideally, it should be a new set of data.
nd = df[sample(nrow(df), 300), ] # ideally this should not be from the train data, this is just for an example code
nd_X <- test[,c('media', 'content')]
nd_Y <- test[,c('retweets')]
Fit models: zeroinf(dist='poisson'), glm(family='poisson'), glm.nb()
# Poisson
summary( m0 <- zeroinfl(retweets ~ media + content, data=df, dist="poisson") )
# Binomial
summary( m1 <- glm(formula=retweets ~ media + content, data=df, family="poisson"(link=log)))
# glm()
#summary( m2 <- glm.nb(retweets ~ media + content, data=df) ) # gives error in summary due to zeros
summary( m2 <- glm.nb(retweets ~ media + content, data=df[df$retweets!=0,]) ) # no error without zeros
Predict using test data set
p0 <- cbind(nd, Count = predict(m0, newdata = nd_X, type = "count"), Zero = predict(m0, newdata = nd, type = "zero"))
p1 <- cbind(nd, Mean = predict(m1, newdata = nd_X, type="response"), SE = predict(m1, newdata = nd, type="response", se.fit=T)$se.fit)
p2 <- cbind(nd, Mean = predict(m2, newdata = nd_X, type="response"), SE = predict(m2, newdata = nd, type="response", se.fit=T)$se.fit)
Output:
Throughout my function I have arguments z and y
I want z to be equal to a data set (for example birthwt) and y to be equal to a response variable (for example birthwt$low)
library("MASS")
library("dplyr")
data(birthwt)
foo=function(z,y){
n.folds <- 10
folds <- cut(sample(seq_len(nrow(z))), breaks=n.folds, labels=FALSE)
all.confusion.tables <- list()
for (i in seq_len(n.folds)) {
train <- filter(z, folds != i)
test <- filter(z, folds == i)
glm.train <- glm(y ~.,family = binomial, data = train)
mod_pred_probs =predict(glm.train,test, type= "response")
pred.class <- ifelse(mod_pred_probs< 0, 0, 1)
all.confusion.tables[[i]] <- table(pred = pred.class, true = test$y)
}
misclassrisk <- function(x) { (sum(x) - sum(diag(x)))/sum(x) }
risk <- sapply(all.confusion.tables, misclassrisk)
return(table(risk))
mean(risk)}
When I run foo(birtht,"low")
I get the error:
Error in model.frame.default(formula = y ~ ., data = train, drop.unused.levels = TRUE) :
variable lengths differ (found for 'low')
Does any one know why I am getting the error or how I can avoid it?