Fitting decays in R

I have data like the following and I want to fit a decay curve to each sensor.
library(tidyverse)
library(broom)

t <- 1:100
x <- 1:80
y1 <- sample(seq(from = 20, to = 50), size = 100, replace = TRUE)
y1 <- y1 %>% jitter()
y2 <- 24 + (60 - 24) * -0.01 * x %>% jitter(10)

df1 <- tibble(t = t, y = y1, sensor = 'sensor1') %>%
  rbind(., data.frame(t = x, y = y2, sensor = 'sensor2'))
sensor1 <- filter(df1, sensor == 'sensor1')
fit <- nls(y ~ SSasymp(t, yf, y0, log_alpha), data = sensor1)
fit
# Fit the data, one model per sensor
fitted <- df1 %>%
  nest(-sensor) %>%
  mutate(
    fit = map(data, ~ nls(y ~ SSasymp(t, yf, y0, log_alpha), data = .)),
    tidied = map(fit, tidy),
    augmented = map(fit, augment)
  )
And I got:
Error in mutate():
! Problem while computing fit = map(data, ~nls(y ~ SSasymp(t, yf, y0, log_alpha), data = .)).
Caused by error in nls():
! singular gradient
Could you please let me know where the issue is?
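For reference, here is a minimal sketch (simulated data, not the data above) of the same nested workflow on sensors that genuinely decay toward an asymptote, which is the shape SSasymp expects; note that y2 above is essentially a straight line, which is one common way to end up with a singular gradient.
# Simulated decay data (a sketch; not the asker's data): both sensors follow
# y = yf + (y0 - yf) * exp(-alpha * t), the curve family SSasymp fits
decay <- function(t, yf, y0, alpha) yf + (y0 - yf) * exp(-alpha * t)

df_demo <- bind_rows(
  tibble(t = 1:100, y = decay(1:100, yf = 20, y0 = 50, alpha = 0.05) + rnorm(100, sd = 0.5),
         sensor = 'sensor1'),
  tibble(t = 1:100, y = decay(1:100, yf = 24, y0 = 60, alpha = 0.03) + rnorm(100, sd = 0.5),
         sensor = 'sensor2')
)

fitted_demo <- df_demo %>%
  group_by(sensor) %>%
  nest() %>%
  mutate(
    fit = map(data, ~ nls(y ~ SSasymp(t, yf, y0, log_alpha), data = .x)),
    tidied = map(fit, tidy),
    augmented = map(fit, augment)
  )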

Related

How can you force predictions of a linear model to start at the same point as new data

I would like my regression model's fitted curve to start at the same point as the data (the value given by the init predictor variable).
Data:
df <- tibble(
  x = c(0:20, 0:20),
  y = c(log(10:30 + 2), log(10:30 + 10)),
  init = c(rep(log(10 + 2), 21), rep(log(10 + 10), 21)),
  group = c(rep('A', 21), rep('B', 21))
)
Model:
lm_fit <- lm(y ~ log(x + 1) + init, data = df)
Example of model fitted to data:
newdata <- df %>%
  filter(group == 'A')

newdata <- newdata %>%
  mutate(pred_y = predict(lm_fit, newdata = newdata, type = 'response')) %>%
  pivot_longer(c(y, pred_y), names_to = 'pred_type', values_to = 'value')

ggplot(newdata, aes(x, value, colour = pred_type)) +
  geom_point() +
  geom_line()
How can I change my model so the red line (model) starts at the same value as the blue line (data)? i.e. when x=0, pred_y = y.
Using your init variable, you have to treat it as an offset (its coefficient is then fixed at 1) and disable the intercept (-1 in the model formula).
lm_fit <- lm(y ~ log(x + 1) + offset(init) - 1, data = df)
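As a quick sanity check (a sketch, reusing the df and lm_fit defined just above, with dplyr loaded as in the rest of this example): at x = 0 the term log(x + 1) is zero, so the prediction collapses to the offset init, which was constructed to equal y at x = 0.
df0 <- df %>% filter(x == 0)
# Should be TRUE: at x = 0 the prediction equals the offset, i.e. the observed y
all.equal(unname(predict(lm_fit, newdata = df0)), df0$y)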
If you change the model formula to log(y) ~ log(x + 1), a possible approach is to transform the y variable and use its new value at x = 0 for the offset (init) variable (I would actually recommend always deriving the offset from the y variable rather than computing it independently). This way only the data are modified and the rest stays the same.
df <- df %>%
  group_by(group) %>%
  mutate(y = log(y),
         init = y[x == 0])

How to generate 1000 data sets and use Ridge and Lasso regression for all of them in R?

So I need to generate 1000 data sets with 200 observations each in R from this model (shown in the linked image), and use Lasso and Ridge regression on all of them. Then I need to get the beta_j coefficients for Lasso and Ridge. Can anyone help? Thanks in advance!
The setup is as you described in the image:
library(magrittr)
library(tidyverse)
library(glmnet)

M <- 9
beta <- c(c(0, 3, 2, 1, 0.5, 0.3),
          rep(0, 10 - 6))
beta <- beta[-1]  # glmnet contains the intercept
sigma <- diag(M) + 0.5 - 0.5 * diag(M)
sigma

N <- 200
G <- 1000
Now, to construct X and the response y for a single dataset:
Xj <- mvtnorm::rmvnorm(n = N, sigma = sigma) %>%
  set_colnames(paste0("x_", seq_len(ncol(.))))
# X <- cbind(intercept = 1, Xj) # glmnet contains the intercept
X <- Xj
epsilon <- rnorm(n = N, sd = 0.5)

beta %>% length()
X %>% ncol()

y <- tcrossprod(beta, X) + epsilon
y
For each dataset, the model estimates have to be found:
list(
  lasso =
    glmnet::cv.glmnet(
      X, y, family = "gaussian",
      alpha = 1,
      intercept = FALSE
    ),
  ridge =
    glmnet::cv.glmnet(
      X, y, family = "gaussian",
      alpha = 0,
      intercept = FALSE
    )
) %>%
  print() %>%
  map_df(. %>% coef() %>% as.matrix() %>% t() %>% as_tibble(), .id = "type")
Now, one could use replicate, but the number of datasets is very large, so we will use parallel programming here:
library(furrr)
plan(multisession, workers = 4)

seq_len(G) %>%
  # seq_len(50) %>%
  furrr::future_map_dfr(
    ~ {
      Xj <- mvtnorm::rmvnorm(n = N, sigma = sigma) %>%
        set_colnames(paste0("x_", seq_len(ncol(.))))
      # X <- cbind(intercept = 1, Xj) # glmnet contains the intercept
      X <- Xj
      epsilon <- rnorm(n = N, sd = 0.5)
      y <- tcrossprod(beta, X) + epsilon

      list(
        lasso =
          glmnet::cv.glmnet(
            X, y, family = "gaussian",
            alpha = 1,
            intercept = FALSE,
            parallel = FALSE
          ),
        ridge =
          glmnet::cv.glmnet(
            X, y, family = "gaussian",
            alpha = 0,
            intercept = FALSE,
            parallel = FALSE
          )
      ) %>%
        # print() %>%
        map_df(. %>%
                 coef() %>%
                 as.matrix() %>%
                 t() %>%
                 as_tibble(), .id = "type") %>%
        mutate(rep = .x)
    },
    .progress = TRUE,
    .options = furrr_options(seed = TRUE)
  ) -> results
This will give a progress bar, and a rep column that records which dataset each set of model estimates came from.
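As a quick check of that bookkeeping (a sketch against the results tibble built above), lasso and ridge should each contribute exactly one row per dataset:
results %>% count(type)   # lasso and ridge should each appear G times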
Let us try to summarise these results somehow:
results %>%
  glimpse() %>%
  pivot_longer(c(`(Intercept)`, starts_with("x_")),
               names_to = "parameter", values_to = "estimate") %>%
  glimpse() %>%
  # ggplot(aes(estimate, group = interaction(type, parameter))) +
  ggplot(aes(estimate)) +
  geom_vline(data = tibble(true_beta = beta, parameter = paste0("x_", 1:9)) %>%
               add_row(true_beta = 0, parameter = "(Intercept)"),
             aes(xintercept = true_beta)) +
  # geom_density() +
  stat_bin(geom = "step", aes(y = after_stat(density))) +
  facet_grid(type ~ parameter, scales = "free") +
  ggpubr::theme_pubclean()
For each parameter there is a collection of estimates, plotted as a histogram, with the true values drawn as vertical lines. The results are quite surprising, to me at least.
Instead of coef, one can use glmnet::coef.glmnet and provide s = c("lambda.1se", "lambda.min"). Just for fun, the plot could also be drawn with both of these hyper-parameter lambdas.
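To pull the coefficients at both lambdas for a single fit (a sketch, not from the original answer, using the assumed name fit_lasso for one of the cv.glmnet objects created above):
c("lambda.1se", "lambda.min") %>%
  purrr::set_names() %>%
  map_df(~ coef(fit_lasso, s = .x) %>% as.matrix() %>% t() %>% as_tibble(),
         .id = "s")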

Overlay decision boundary for random forests and boosting

I generate some random data and am trying to overlay a decision boundary based on fits from random forests and boosting. I can recreate the problem below: I generate the data, and using regression trees I can easily overlay the decision boundary with the following code:
library(tidyverse)
library(rsample)        # initial_split(), training(), testing()
library(tree)
library(randomForest)
library(gbm)

# set seed and generate some random data
set.seed(123)
Dat <- tibble(
  x1 = rnorm(100),
  x2 = rnorm(100)
) %>% mutate(y = as_factor(ifelse(x1^2 + x2^2 > 1.39, "A", "B")))
circlepts <- tibble(theta = seq(0, 2*pi, length = 100)) %>%
  mutate(x = sqrt(1.39) * sin(theta), y = sqrt(1.39) * cos(theta))

# graph the data and draw the boundary
p <- ggplot(Dat) + geom_point(aes(x1, x2, color = y)) + coord_fixed() +
  geom_polygon(data = circlepts, aes(x, y), color = "blue", fill = NA)

# convert character to binary inputs making classification easier
binVec <- as.vector(Dat$y)
binVec[which(binVec == "A")] <- 1
binVec[which(binVec == "B")] <- 0
binVec <- as.numeric(binVec)
Dat$y <- binVec

# split the data up
datasplit <- initial_split(Dat, prop = 0.7)
training_set <- as_tibble(training(datasplit))
testing_set <- as_tibble(testing(datasplit))

tree_fit <- tree(y ~ ., training_set)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
                 x2 = modelr::seq_range(testing_set$x1, 50)) %>%
  modelr::add_predictions(tree_fit)

# plot the data with the decision overlay of the tree fit
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)
Now if I try doing the same with random forests or gradient boosting, add_predictions() doesn't cooperate so well...
rf_fit <- randomForest(y ~ ., data = training_set, mtry = 2, ntree = 500)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
                 x2 = modelr::seq_range(testing_set$x1, 50)) %>%
  modelr::add_predictions(rf_fit)
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)
## ERROR: Error in if (is.na(out.type)) stop("type must be one of 'response', 'prob', 'vote'") : argument is of length zero
And for gradient boosting:
fitBoost <- gbm(y ~ ., data = Dat, distribution = "gaussian", n.trees = 1000)
pred <- predict(fitBoost, newdata = training_set, n.trees = 1000)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
                 x2 = modelr::seq_range(testing_set$x1, 50)) %>%
  modelr::add_predictions(fitBoost)
## ERROR: Error in paste("Using", n.trees, "trees...\n") : argument "n.trees" is missing, with no default
It seems to be a very simple problem. Could someone help me out?
The following code works with your random forest:
training_set$y <- factor(training_set$y)

rf_fit <- randomForest(y ~ ., data = training_set, mtry = 2, ntree = 500)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
                 x2 = modelr::seq_range(testing_set$x1, 50)) %>%
  modelr::add_predictions(rf_fit)
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)
And here is the code for the gradient boosting machine:
fitBoost <- gbm(y ~ ., data = Dat, distribution = "gaussian", n.trees = 1000)
pred <- predict(fitBoost, newdata = training_set, n.trees = 1000)

# Patched copies of modelr's add_predictions()/predict helpers that forward
# n.trees to predict.gbm(), which is what the original call was missing
add_predictions2 <- function(data, model, var = "pred", type = NULL) {
  data[[var]] <- predict2(model, data, type = type)
  data
}

predict2 <- function(model, data, type = NULL) {
  if (is.null(type)) {
    stats::predict(model, data, n.trees = 1000)
  } else {
    stats::predict(model, data, type = type, n.trees = 1000)
  }
}

grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
                 x2 = modelr::seq_range(testing_set$x1, 50)) %>%
  add_predictions2(fitBoost)
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)
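A lighter-weight alternative (a sketch, not part of the original answer) is to skip add_predictions() entirely and call predict() on the gbm fit directly, since the only real problem is getting n.trees passed through:
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
                 x2 = modelr::seq_range(testing_set$x1, 50)) %>%
  mutate(pred = predict(fitBoost, newdata = ., n.trees = 1000))
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)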

Histogram of AIC for each model

Hello, how can I create a histogram of the differences between each model's AIC and the AIC of the full model?
# AIC of the full model
Y <- modelTT$aic

# AICs for each of the n models
X <- lapply(listOfModels, function(xx) xx$aic)
So basically I want to compute X - Y first. Then I need to plot the difference values ordered from largest to smallest.
Another alternative using broom:
df <- data.frame(a = sample(1:10, replace = TRUE, 24),
                 b = sample(25:40, replace = TRUE, 24),
                 c = sample(0:1, replace = TRUE, 24))

model1 <- lm(a ~ b + c, df)
model2 <- lm(b ~ c, df)
model3 <- lm(a ~ c, df)

library(broom)
library(ggplot2)
library(dplyr)

mod1 <- glance(model1) %>% mutate(model = "m1")
mod2 <- glance(model2) %>% mutate(model = "m2")
mod3 <- glance(model3) %>% mutate(model = "m3")

models <- bind_rows(mod1, mod2, mod3)
models %>% ggplot(aes(model, AIC)) + geom_bar(stat = "identity")
This gives the following plot:
A generic data.frame:
db <- data.frame(y = c(1, 2, 3, 4, 5, 6, 7, 8, 9),
                 x1 = c(9, 8, 7, 6, 5, 4, 3, 2, 1),
                 x2 = c(9, 9, 7, 7, 5, 5, 3, 3, 1))
A list of lm models:
LM_models <- list()
LM_models[[1]] <- lm(y ~ x1 + x2, data = db)
LM_models[[2]] <- lm(y ~ x1, data = db)
LM_models[[3]] <- lm(y ~ x2, data = db)
AIC calculation:
aics <- lapply(LM_models, AIC)
Decreasing plot:
plot(sort(unlist(aics), decreasing = TRUE), type = "h")
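Coming back to the X and Y from the question, the requested difference plot can be produced in the same style (a sketch, assuming X and Y are defined exactly as in the question above):
aic_diff <- unlist(X) - Y   # the X - Y step from the question, one value per model
plot(sort(aic_diff, decreasing = TRUE), type = "h",
     ylab = "AIC difference to full model")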

Multiple Regression lines in ggplot2

Here is some test code, and I don't understand why it is not working as expected. It is a ggplot2 question, not a general R one.
library(ggplot2)

K <- 10
x <- 1:100/100
y <- sapply(x, FUN = function(x) 1 + x)
xy <- data.frame(x, y)
set.seed(1234)
xy$yrand <- xy$y + runif(100, min = -0.35, max = 0.5)

folds <- cut(seq(1, nrow(xy)), breaks = K, labels = FALSE)

p1 <- ggplot(xy, aes(x = xy$x, y = xy$yrand)) + geom_point() +
  ggtitle("Simple x vs y plot with added random noise") + xlab("X") + ylab("Y")

for (i in 1:K) {
  # Segment your data by fold using the which() function
  testIndexes <- which(folds == i, arr.ind = TRUE)
  testData <- xy[testIndexes, ]
  trainData <- xy[-testIndexes, ]

  lmTemp <- lm(yrand ~ x, data = trainData)
  p1 <- p1 + geom_line(data = trainData, aes(x = trainData$x, y = predict(lmTemp, newdata = trainData)))
}
p1
Now what I would like to see is a plot with 10 lines (the regression lines), but I only see one. Can someone help me out? Is it the ggplot2 syntax that is wrong?
Thanks, Umberto
EDIT:
I accepted the answer below since it is a nice way of doing it. I just wanted to add a simple way of doing it by preparing a data set for the lines of the graph I wanted to create. I think this method is slightly easier to understand if you don't have much R experience.
fitLines <- NULL

for (i in 1:K) {
  # Segment your data by fold using the which() function
  testIndexes <- which(folds == i, arr.ind = TRUE)
  testData <- xy[testIndexes, ]
  trainData <- xy[-testIndexes, ]

  lmTemp <- lm(yrand ~ x, data = trainData)
  # Let's build a data set for the lines
  fitLines <- rbind(fitLines,
                    data.frame(rep(paste("set", i), nrow(trainData)),
                               trainData[, 1],
                               predict(lmTemp, newdata = trainData)))
}
names(fitLines) <- c("set", "x", "y")

p1 + geom_line(data = fitLines, aes(x = x, y = y, col = set))
And this is what you get
You could use the crossv_kfold() function from the modelr package and put your complete code into a pipe workflow:
library(modelr)
library(tidyverse)

x <- 1:100/100
y <- sapply(x, FUN = function(x) 1 + x)
xy <- data.frame(x, y)
set.seed(1234)
xy$yrand <- xy$y + runif(100, min = -0.35, max = 0.5)

xy %>%
  crossv_kfold() %>%
  mutate(
    models = map(train, ~ lm(yrand ~ x, data = .x)),
    predictions = map2(models, test, ~ predict(.x, newdata = .y, type = "response"))
  ) %>%
  select(-train, -test, -models) %>%
  unnest() %>%
  bind_cols(xy) %>%
  ggplot(aes(x = x, y = predictions)) +
  stat_smooth(aes(colour = .id), method = "lm", se = FALSE) +
  geom_point(aes(y = yrand))
Putting the colour aesthetic inside the ggplot() call would also map the points to the groups:
xy %>%
  crossv_kfold() %>%
  mutate(
    models = map(train, ~ lm(yrand ~ x, data = .x)),
    predictions = map2(models, test, ~ predict(.x, newdata = .y, type = "response"))
  ) %>%
  select(-train, -test, -models) %>%
  unnest() %>%
  bind_cols(xy) %>%
  ggplot(aes(x = x, y = predictions, colour = .id)) +
  stat_smooth(method = "lm", se = FALSE) +
  geom_point(aes(y = yrand))
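Note that crossv_kfold() defaults to 5 folds; to reproduce the K = 10 folds used in the question, you would pass k explicitly (a small sketch of the changed call):
xy %>%
  crossv_kfold(k = 10) %>%
  mutate(
    models = map(train, ~ lm(yrand ~ x, data = .x)),
    predictions = map2(models, test, ~ predict(.x, newdata = .y, type = "response"))
  )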
