Passing data frame fields to a regression model in a function - r

I am just a few months into using R and this is my first post. I am looking to create a function that takes fields from a data frame, filters the outliers via quantiles, then writes the regression parameters as annotations on a scatter plot. The filtering and plotting work correctly but I get an error in the linear model. Can I convert those fields to execute in the model?
Error message:
Error in model.frame.default(formula = df[, field1] ~ df[, field2], drop.unused.levels = TRUE) :
invalid type (list) for variable 'df[, field1]'
Here is the function:
scatter_filtered <- function(df,field1,field2,field3) {
range1 <- quantile(df[, field1], probs= c(0.1,0.9), na.rm=TRUE)
range2 <- quantile(df[, field2], probs= c(0.1,0.9), na.rm=TRUE)
low_end1 <- range1[1]
high_end1 <- range1[2]
low_end2 <- range2[1]
high_end2 <- range2[2]
df %>%
filter(df[, field1] > low_end1, df[, field1] < high_end1,
df[, field2] > low_end2, df[, field2] < high_end2) %>%
model <- lm(df[,field1] ~ df[, field2])
r_output <- round(glance(model)$r.squared, digits = 5)
r_adj_output <- round(glance(model)$adj.r.squared, digits = 5)
p_output <- round(glance(model)$p.value, digits = 5) %>%
ggplot(aes_string(x = field1, y = field2, color = field3)) +
geom_point() +
geom_smooth(method="lm", se=FALSE)
# annotate("text", label = paste("r_sq:",r_output), x=0.1, y=0.1, parse=TRUE) +
# annotate("text", label = paste("p-val:",p_output), x=0.1, y=0.1, parse=TRUE)
}

Related

Error in FUN(X[[i]], ...) : object 'predicted' not found

i try to search about this problem but i couldn't find,that i try to apply ggplot for the relation between the two variables (predictor and predicted x,y), the method was linear regression model (lm) but i got this error
Error in FUN(X[[i]], ...) : object 'predicted' not found
and this my code:
# Install tidymodels if you haven't done so
install.packages("rlang")
install.packages("tidymodels")
install.packages("dplyr")
# Library for modeling
library(tidymodels)
# Load tidyverse
library(tidyverse)
library(dplyr)
URL <- 'https://dax-cdn.cdn.appdomain.cloud/dax-noaa-weather-data-jfk-airport/1.1.4/noaa-weather-sample-data.tar.gz'
download.file (URL, destfile='noaa-weather-sample-data.tar.gz')
untar('noaa-weather-sample-data.tar.gz',tar = 'internal')
dataset<- read.csv ('noaa-weather-sample-data/jfk_weather_sample.csv')
head(dataset)
glimpse(dataset)
subset_data <- data.frame(dataset$HOURLYRelativeHumidity,dataset$HOURLYDRYBULBTEMPF,dataset$HOURLYStationPressure,dataset$HOURLYWindSpeed,dataset$HOURLYPrecip)
subset_data<-setNames(subset_data,c('HOURLYRelativeHumidity','HOURLYDRYBULBTEMPF','HOURLYStationPressure','HOURLYWindSpeed', 'HOURLYPrecip'))
head(subset_data,10)
unique(subset_data$HOURLYPrecip)
subset_data <- subset_data %>%mutate(HOURLYPrecip = replace(HOURLYPrecip, HOURLYPrecip == "T", "0.0"))
View(subset_data)
install.packages('stringr ')
library(stringr)
subset_data$HOURLYPrecip <- str_remove(subset_data$HOURLYPrecip, pattern = 's')
unique(subset_data$HOURLYPrecip)
subset_data$HOURLYPrecip <- as.numeric(subset_data$HOURLYPrecip)
subset_data$HOURLYRelativeHumidity <- as.numeric(subset_data$HOURLYRelativeHumidity)
subset_data$HOURLYDRYBULBTEMPF <- as.numeric(subset_data$HOURLYDRYBULBTEMPF)
subset_data$HOURLYWindSpeed <- as.numeric(subset_data$HOURLYWindSpeed)
str(subset_data1)
subset_data1 <- setNames(subset_data,c('relative_humidity','dry_bulb_temp_f','station_pressure','wind_speed', 'precip'))
#library(rsample)
set.seed(1234)
#split_data <- initial_split(subset_data1)
dt<- sort(sample(nrow(subset_data1), nrow(subset_data1)*.8))
train_data<- subset_data1 [dt,]
test_data <- subset_data1 [-dt,]
install.packages('ggplot')
library(ggplot2)
hist(train_data$relative_humidity)
hist(train_data$dry_bulb_temp_f)
hist(train_data$station_pressure)
hist(train_data$wind_speed)
hist(train_data$precip)
install.packages('stats')
library(stats)
linear_reg1 <- lm (precip ~ relative_humidity, train_data)
linear_reg2 <- lm (precip ~ dry_bulb_temp_f, train_data)
linear_reg3 <- lm (precip ~ wind_speed, train_data)
linear_reg4 <- lm (precip ~ station_pressure, train_data)
#ggplot(train_data, aes(x = train_data$relative_humidity ,y= train_data$precip)) + geom_point()+ stat_smooth(method = 'lm', col ='red')
ggplot(train_data, aes(x= relative_humidity, y= precip))+
geom_smooth(method='lm',se=FALSE ,color='red')+
geom_segment(aes(xend= relative_humidity ,yend = predicted),alpha=.2)+
geom_point()+
geom_point(aes(y = predicted),shape=1)+
theme_bw()
You simply don't have a column called predicted in your data set. Presumably you want this to be the predicted value of precipitation based on humidity. In that case, you can easily create the column in your data frame using your first linear model:
train_data$predicted <- predict(linear_reg1, train_data)
ggplot(train_data, aes(x = relative_humidity, y = precip)) +
geom_segment(aes(xend = relative_humidity, yend = predicted), alpha = 0.2) +
geom_point(alpha = 0.1) +
geom_smooth(method='lm', se = FALSE , color = 'red') +
geom_point(aes(y = predicted), shape = 1, alpha = 0.1) +
theme_bw()
As you can see, a linear model is no good here. It predicts negative precipitation at low humidity and underestimates the precipitation at high humidity.

ggplot - use data passed to ggplot to calculate the mean of the data in subsequent geom calls [duplicate]

I was wondering why variable mean_y is not recognized by my
geom_hline(yintercept = unique(mean_y)) call?
library(tidyverse)
set.seed(20)
n_groups <- 2
n_in_group <- 20
sd_e = 2
groups <- gl(n_groups, n_in_group, labels = c("T","C"))
age <-rnorm(length(groups), 25, 3)
betas <- c(5,0,0,2)
dat <- data.frame(groups=groups,age=age)
X <- model.matrix(~ groups * age, data = dat)
lin_pred <- as.vector(X %*% betas)
dat$y <- rnorm(nrow(X), lin_pred, sd_e)
dat %>% group_by(groups) %>% mutate(mean_y = mean(y)) %>%
ungroup() %>%
ggplot()+aes(x = age, y = y) +
geom_point(aes(color=groups)) +
geom_hline(yintercept = unique(mean_y)) # Error in unique(mean_y) :
# object 'mean_y' not found
Variables need to be inside aes(), try:
geom_hline(aes(yintercept = mean_y))

Looping ggplot2 multiple linear regression

I am trying to loop my multiple linear regression plot and summaries, but I keep encountering an error in R that states Error: More than one expression parsed. I am not sure how to fix this or if there is a better way to achieve what I want to do which is mainly:
Plot a multiple linear regression plot with Group as the colour
Get summary for each of the linear regression lines based on Group
Compute regression summary
Perform anova to determine differences
colNames <- names(df)[c(35:39)]
for(i in colNames){
plt <- ggplot(df,
aes_string(x=df$MachineLength, y=i, fill=df$Group, color=be_nlyl$Group)) +
geom_smooth(method=lm) +
geom_point(size = 2, alpha=0.7) +
labs(title="Machine", subtitle = "Machine Type") +
theme_bw() +
theme(plot.title = element_text(hjust=0.5, face="bold"),
plot.subtitle = element_text(hjust=0.5))
print(plt)
lm_A <- lm(formula = i ~ MachineLength, data = subset(be_nlyl, Group == "A"))
summary(lm_A) %>% print()
lm_B <- lm(formula = i ~ MachineLength, data = subset(be_nlyl, Group == "B"))
summary(lm_B) %>% print()
clz.lm <- lm(formula = i ~ Group + MachineLength + Group:MachineLength, data = df)
summary(clz.lm) %>% print()
ano.lm <- Anova(lm(i ~ MachineLength*Group, data = df))
print(ano.lm)
}
Anyone have ideas of how to implement above? Thank you!
Try the following :
Create lists of length colNames to store all the outputs so that instead of just printing the output we can store them as well.
Use for loop over the index of colNames instead of actual column names so that you can use that as an index to store the output for different objects.
aes_string has be deprecated so we use .data pronoun to pass column name as variable.
Use sprintf to create formula string which is passed in lm function.
library(ggplot2)
colNames <- names(df)[c(35:39)]
plt <- vector('list', length(colNames))
lm_A <- vector('list', length(colNames))
summary_lm_A <- vector('list', length(colNames))
summary_lm_B <- vector('list', length(colNames))
lm_B <- vector('list', length(colNames))
clz.lm <- vector('list', length(colNames))
summary_clz.lm <- vector('list', length(colNames))
ano.lm <- vector('list', length(colNames))
for(i in seq_along(colNames)) {
var <- colNames[i]
plt[[i]] <- ggplot(df, aes(MachineLength, .data[[var]], fill= Group, color= Group)) +
geom_smooth(method=lm) +
geom_point(size = 2, alpha=0.7) +
labs(title="Machine", subtitle = "Machine Type") +
theme_bw() +
theme(plot.title = element_text(hjust=0.5, face="bold"),
plot.subtitle = element_text(hjust=0.5))
lm_A[[i]] <- lm(sprintf('%s~MachineLength', var), data = subset(df, Group == "A"))
summary_lm_A[[i]] <- summary(lm_A[[i]])
lm_B[[i]] <- lm(sprintf('%s~MachineLength', var), data = subset(df, Group == "B"))
summary_lm_B[[i]] <- summary(lm_B[[i]])
clz.lm[[i]] <- lm(sprintf('%s~Group + MachineLength + Group:MachineLength', var), data = df)
summary_clz.lm[[i]] <- summary(clz.lm[[i]])
ano.lm[[i]] <- Anova(lm(sprintf('%s~MachineLength*Group', var), data = df))
}

Getting a variable to pass into function in R (ggplot2)

I'm trying to plot a graph between two columns of data from the data frame called "final". I want the p value and r^2 value to show up on the graph.
I'm using this function and code, but it gives me the error "cannot find y value"
library(ggplot2)
lm_eqn <- function(final, x, y){
m <- lm(final[,y] ~ final[,x])
output <- paste("r.squared = ", round(summary(m)$adj.r.squared, digits = 4), " | p.value = ", formatC(summary(m)$coefficients[8], format = "e", digits = 4))
return(output)
}
output_plot <- lm_eqn(final, x, y)
p1 <- ggplot(final, aes(x=ENSG00000153563, y= ENSG00000163599)) + geom_point() + geom_smooth(method=lm, se=FALSE) + labs(x = "CD8A", y = "CTLA-4") + ggtitle("CD8 v/s CTLA-4", subtitle = paste("Linear Regression of Expression |", output_plot))
How do I get both columns of data x and y to flow through the function and for the graph to plot with the p value and residual value printed on the graph?
Thanks in advance.
When you call function for output_plot generation you have to use the same ENS... variables as in your plot. After simplifying slightly function, should work now
library(stats)
library(ggplot2)
lm_eqn <- function(x, y){
m <- lm(y ~ x)
output <- paste("r.squared = ", round(summary(m)$adj.r.squared, digits = 4), " | p.value = ", formatC(summary(m)$coefficients[8], format = "e", digits = 4))
return(output)
}
x <-c(1,2,5,2,3,6,7,0)
y <-c(2,3,5,9,8,3,3,1)
final <- data_frame(x,y)
output_plot <- lm_eqn(x, y)
p1 <- ggplot(final, aes(x=x, y= y)) + geom_point() + geom_smooth(method=lm, se=FALSE) + labs(x = "x", y = "y") + ggtitle("CD8 v/s CTLA-4", subtitle = paste("Linear Regression of Expression |", output_plot))

Writing a function for ggplot

I am new to R and have been trying to figure this out for a while. Basically, I have a data frame, and various y variables. I am trying to write a function that will allow me to come up with a customized graph template for the many different y variables that I have. I am trying the following code below but I am met with this error:
1: In eval(expr, envir, enclos) : NAs introduced by coercion
2: In aes_string(xvar[max(which(complete.cases(yvar)))], yvar[max(which(complete.cases(yvar)))], :
NAs introduced by coercion
The code works if I add the variables in directly and not through a function. I believe that it is something to do with how the function plugs in the xvar into the as.numeric() function. I am not sure but any of you knows how to deal with this?
test <- function (Data, xvar, yvar){
# Plot data
plot <- ggplot(subset(Data,!is.na((yvar))), aes_string(xvar, yvar)) + geom_line(colour="darkblue") + theme_bw()
# Add Trendline for recent data
plot <- plot + geom_smooth(data=subset(Data, xvar > as.numeric(xvar)[max(which(complete.cases(yvar)))-8]), method = "lm")
# Label most recent data
plot + geom_text(data = Data, aes_string(xvar[max(which(complete.cases(yvar)))],
yvar[max(which(complete.cases(yvar)))],
label = as.numeric(yvar)[max(which(complete.cases(yvar)))],
hjust= -0.5, vjust = 0.5))
As xvar is probably (you do not show a reproducible example) a character vector of length 1, subsetting like xvar[] will not yield the desired result.
You could try something like
library(ggplot2)
f <- function(data, xvar, yvar) {
ggplot(data, aes_string(xvar, yvar)) +
geom_point() +
geom_smooth(data=subset(data, eval(parse(text=xvar)) > 5), method = "lm")
}
or
f <- function(data, xvar, yvar) {
ggplot(data, aes_string(xvar, yvar)) +
geom_point() +
geom_smooth(data = data[data[, xvar]>5, ], method = "lm")
}
f(mtcars, "cyl", "disp")
I think #LukeA has gotten you practically all the way there, but here is an example that uses your data and adds a few more columns to help demonstrate how you can pass column names into ggplot inside your own function.
It uses your variable names. It subsets your data into a data.frame with non-missing values for y, and then it subsets your data into a separate data.frame that allows you to add additional filtering criteria to your smoothing function.
library(zoo)
set.seed(72)
X1 <- as.yearqtr(seq(as.Date("2010/3/1"), by = "quarter", length.out = 10))
Y1 <- as.vector(c(124,315,363,574,345,434,141,512,142,647))
Y2 <- sample(Y1)
Y3 <- sample(Y1)
Data1 <- data.frame(X1, Y1, Y2, Y3)
plot_function <- function(data, xvar, yvar){
# remove rows with NA on yvar
mydata1 <- data[!is.na(data[, yvar]), ]
# remove rows with NA on yvar and subset yvar above some threshold
mydata2 <- data[!is.na(data[, yvar]) & data[, yvar] > 400, ]
# plot it
myplot <- ggplot(mydata1, aes_string(xvar, yvar)) +
geom_line(colour="darkblue") +
scale_x_yearqtr(limits = c(min(mydata1[, xvar]), max(mydata1[, xvar])), format = "%YQ%q") +
geom_smooth(data = mydata2, aes_string(xvar, yvar), method = "lm") +
geom_text(data = mydata1, aes_string(xvar, yvar, label = yvar), hjust= -0.5, vjust = 0.5) +
theme_bw()
return(myplot)
}
plot_function(data = Data1, xvar = "X1", yvar = "Y1")
plot_function(data = Data1, xvar = "X1", yvar = "Y2")
plot_function(data = Data1, xvar = "X1", yvar = "Y3")

Resources