Randomizing a distribution of data in a list - r

I have a data frame df that I would like to separate into a training set and a test set. Instead of getting only a single training and test set, I would like to get a distribution of them (n = 100).
I try and do this with lapply, but the values for each element in the list end up being exactly the same. How do I randomize the values in the two list (i.e., train.data and test.data)?
The expected output would be a list for both train.data and test.data, each containing 100 elements with different subsets of df in both of them.
library(lubridate)
library(tidyverse)
library(caret)
date <- rep_len(seq(dmy("01-01-2013"), dmy("31-12-2013"), by = "days"), 300)
ID <- rep(c("A","B","C"), 50)
class <- rep(c("N","M"), 50)
df <- data.frame(value = runif(length(date), min = 0.5, max = 25),
ID,
class)
training.samples <- df$class %>%
createDataPartition(p = 0.6, list = FALSE)
n <- 100
train.data <- lapply(1:n, function(x){
df[training.samples, ]
})
test.data <- lapply(1:n, function(x){
df[-training.samples, ]
})

Try using replicate
f1 <- function(dat, colnm) {
s1 <- createDataPartition(dat[[colnm]], p = 0.6,
list = FALSE)
return(list(train.data = dat[s1,], test.data = dat[-s1,]))
}
n <- 100
out <- replicate(n, f1(df, "class"), simplify = FALSE)

Related

Loop a function in r to create a new table

I have a dataframe in r and want to perform the levene's/ variance test on multiple variables with two groups and save all results in a table. I have tried to do this using a for() loop and sapply() but I get neither working:
df <- data.frame(
x = rnorm(100, 0, 1),
y = rnorm(100, 50, 1),
z = rnorm(100, 70, 2),
group = rep(c(0,1), each = 50)
)
varlist <- c("x","y","z")
res.var <- character(length(varlist))
res.f <- numeric(length(varlist))
res.p <- numeric(length(varlist))
Option 1)
for(i in seq_along(varlist)) {
form <- substitute(i ~ group, list(i = as.name(varlist)))
result <- var.test(
formula = form,
data = df)
res.var[i] <- varlist[i]
res.f[i] <- result$estimate
res.p[i] <- result$p.value
}
Option 2:
sapply(varlist, function(x) {
form <- substitute(i ~ group, list(i = as.name(varlist)))
result <- var.test(
formula = form,
data = df)
res.var[i] <- varlist[i]
res.f[i] <- result$estimate
res.p[i] <- result$p.value
})
Maybe there's an easier way to that this. I'd be glad for any help ;o) Thank you in advance.

How to find a particular x value for a given y value in a plot?

I have this data set and I plot F_1 against ks. I need to find the value of ks that has the maximum F_1 value.
set.seed(1)
library(caret)
library(dplyr)
library(modelr)
data("heights")
ks <- seq(1, 101, 3)
F_1 <- sapply(ks, function(k){
test_index <- createDataPartition(heights$sex, times = 1, p = 0.5, list = FALSE)
test_set <- heights[test_index, ]
train_set <- heights[-test_index, ]
fit <- knn3(sex ~ height, data = train_set, k = k)
y_hat <- predict(fit, test_set, type = "class") %>%
factor(levels = levels(train_set$sex))
F_meas(data = y_hat, reference = test_set$sex)
})
plot(ks, F_1)
I can get the maximum F_1 value from max(F_1). But how to get corresponding ks value for that maximum F_1 value?
To obtain the input value that corresponds to the maximum of the output you may simply make use of the index obtained from the output vector of your function.
Example:
f1 <- function(x){
-x^2
}
# Input Values
z <- -100:100
# Corresponding Input Value(s) to max output
z[f1(z) == max(f1(z))]

I'm getting Error in dat$y : $ operator is invalid for atomic vectors when trying to calculate the possible results using map() function

I am currently taking an online Data science: Machine learning course and we are asked to fit a lm 100 times and obtain the values of the mean (rmse) and sd(rmse) for data sets of different sizes n=c(100,500,1000,5000,10000).
we are asked to create a function that takes the size n and builds the dataset, then runs the loop made for fitting the 100 models, then set the seed and use a map() or sapply() function for applying our new function to the n different sizes.
The code I did is showing me "Error in dat$y : $ operator is invalid for atomic vectors" error when I run f1
This is my code:
library(MASS)
library(caret)
ff=function(n){
Sigma <- 9*matrix(c(1.0, 0.5, 0.5, 1.0), 2, 2)
dat <- MASS::mvrnorm(n, c(69, 69), Sigma)%>%data.frame() %>% setNames(c("x", "y"))
}
set.seed(1,sample.kind = "Rounding")
n=c(100,500,1000,5000,10000)
f1=map(n,function(dat){
rmse=replicate(100,{
y <- dat$y
test_index <- createDataPartition(y, times = 1, p = 0.5, list = FALSE)
train_set <- dat %>% slice(-test_index)
test_set <- dat %>% slice(test_index)
fit <- lm(y ~ x, data = train_set)
y_hat <- fit$coef[1] + fit$coef[2]*test_set$x
sqrt(mean((y_hat - test_set$y)^2))
})
structure(c(mean(rmse),sd(rmse)))
})
Thank you for your help!!
I think you should use something like :
library(caret)
library(dplyr)
n=c(100,500,1000,5000,10000)
f1= purrr::map(n,function(x){
rmse=replicate(100,{
dat <- ff(x)
y <- 1:nrow(dat)
test_index <- createDataPartition(y, times = 1, p = 0.5, list = FALSE)
train_set <- dat %>% slice(-test_index)
test_set <- dat %>% slice(test_index)
fit <- lm(y ~ x, data = train_set)
y_hat <- fit$coef[1] + fit$coef[2]*test_set$x
sqrt(mean((y_hat - test_set$y)^2))
})
c(mean(rmse),sd(rmse))
})

Using $ to refer to multiple variables in user defined function R

I am trying to run a function which calculates the marginal effects for different mixed effects models, based on two different main predictors (var1 vs. var2). The original code can be found here:
https://stats.idre.ucla.edu/r/dae/mixed-effects-logistic-regression/. Below is a reproducible example:
I create a dataframe (ex):
time <- seq(from = 1, to = 500, by =1)
var1 <- factor(sample(0:1, 500, replace = TRUE))
var2 <- factor(sample(0:1, 500, replace = TRUE))
var3 <- sample(1:500, 500, replace = TRUE)
group <- rep(1001:1005, 500)
out <- sample(0:1, 500, replace = TRUE)
group <- as.factor(group)
ex <- data.frame(time,var1,var2,var3,group,out)
Run the models:
m1a <- glmer(out ~ time + var1 + (1|group), data=ex, family = binomial(link = "logit"), nAGQ = 1,
control = glmerControl(calc.derivs = FALSE))
m1b <- glmer(out ~ time + var2 + (1|group), data=ex, family = binomial(link = "logit"), nAGQ = 1,
control = glmerControl(calc.derivs = FALSE))
Create subsets of the data with only the predictors for complete cases:
sub1a <- na.omit(ex[, c("time", "var1", "group")])
sub1b <- na.omit(ex[, c("time", "var2", "group")])
I cannot attach my data frame, ex, because R says var1 and var2 are masked. Therefore, the only way I know to refer to the variables is using $. However, every function I create produces a wrong or null result. I first tried:
marg <- function(v1, v2, d, m) {
biprobs <- lapply(levels(v1), function(var) {
v2[ ] <- var
lapply(time, function(ti) {
d$time <- ti
predict(m, newdata = d, type = "response")
})
})
plotdat <- lapply(biprobs, function(X) {
temp <- t(sapply(X, function(x) {
c(M=mean(x), quantile(x, c(.25, .75)))
}))
temp <- as.data.frame(cbind(temp,time))
colnames(temp) <- c("PP", "Lower", "Upper", "Dayssince")
return(temp)
})
plotdat <- do.call(rbind, plotdat)
}
result1 <- marg(ex$var1, sub1a$var1, sub1a, m1a)
Although this creates a data frame, it produces the same predicted probabilities for each level of var1 (0 vs. 1) at a given time (1-500), which is not what I want. So then I tried:
marg <- function(v, d, m) {
biprobs <- lapply(levels(ex$v), function(var) {
d$v[ ] <- var
lapply(time, function(ti) {
d$time <- ti
predict(m, newdata = d, type = "response")
})
})
.....
}
result2 <- marg(var1,sub1a, m1a)
This produces a null result. I also tried, which produces a null result:
marg <- function(d1,v,d2,m) {
biprobs <- lapply(levels(d1$v), function(var) {
d2$v[ ] <- var
lapply(time, function(ti) {
d2$time <- ti
predict(m, newdata = d2, type = "response")
})
})
......
}
result3 <- marg(ex,var1,sub1a,m1a)
I also tried creating a new object to input directly into the function:
v1 <- ex$var1
marg <- function(d, m) {
biprobs <- lapply(levels(v1), function(var) {
.....
})
})
That also produces a null result. How do I refer to different variables in an unattached data frame?? The code works with direct inputs, so it's a matter of correctly defining the function arguments. I appreciate any help!

Computing NTILE in R for RFM analysis

I'm trying to create a dataframe computing 10 percentiles based on the Recency, Frequency and Monetary. I have most of it set up, but I can't figure out why my code is returning three NTILES, when I'm asking for 10. I'm currently at a stand still. The next step will be calculating the percentage of customers in each ntile.
Here is my code:
rm(list = ls())
setwd("/Users/a76475/Documents/Customer_Analytics")
rfm<-read.csv("cdnow_students_transaction.csv")
#Set up test and calibration samples
rfm$DATE <- as.Date(rfm$DATE, format = "%m/%d/%y")
calib <- subset(rfm, rfm$DATE<"1997-09-29")
valid <- subset(rfm, rfm$DATE>"1997-09-30")
#Aggregate for frequency, monetary, and recency -- Calibration
recency<- aggregate(DATE ~ ID, data =calib, FUN = max)
colnames(recency) <- c("ID","Recency")
frequency <- aggregate(DOLLARS ~ ID, data =calib, FUN = length)
colnames(frequency) <- c("ID","Frequency")
monetary <- aggregate(DOLLARS ~ ID, data =calib, FUN = mean)
colnames(frequency) <- c("ID","Monetary")
calib <- merge(frequency, monetary, by = "ID")
calib <- merge(calib, recency, by = "ID")
#Aggregate for frequency, monetary, and recency -- Validation
recency<- aggregate(DATE ~ ID, data =valid, FUN = max)
colnames(recency) <- c("ID","Recency")
frequency <- aggregate(DOLLARS ~ ID, data =valid, FUN = length)
colnames(frequency) <- c("ID","Frequency")
monetary <- aggregate(DOLLARS ~ ID, data =valid, FUN = mean)
colnames(frequency) <- c("ID","Monetary")
valid <- merge(frequency, monetary, by = "ID")
valid <- merge(valid, recency, by = "ID")
colnames(valid) <- c("ID","FREQ","MONETARY","RECENCY")
colnames(calib) <- c("ID","FREQ","MONETARY","RECENCY")
calib$RECENCY <- NULL
#Create recency score
#For validation
for (i in 1:nrow(valid)) {
valid$RECENCY1[i] = as.numeric(max(valid$RECENCY) - valid$RECENCY[i])
}
valid$RECENCY <- valid$RECENCY1
valid$RECENCY1 <- NULL
#For calibration
for (i in 1:nrow(calib)) {
calib$RECENCY1[i] = as.numeric(max(valid$RECENCY) - calib$RECENCY[i])
}
calib$RECENCY <- calib$RECENCY1
calib$RECENCY1 <- NULL
#Merge datasets
rfm <- merge(calib,valid, by="ID", all.x = TRUE)
#Create Column for retention%
require(dplyr)
rfm$monetary.ntile <- ntile(rfm$MONETARY.y,10)
rfm$freq.ntile <- ntile(rfm$FREQ.y,10)
rfm$recency.ntile <- ntile(rfm$RECENCY,10)
For example, if you want 10 buckets for Recency, Frequency and Monetary Ratio:
set.seed(1)
n <- 100
df <- data.frame(
R = runif(n, 1, 365),
F = runif(n, 1, 5),
M = runif(n, 0, 100)
)
apply(df, 2, function(col) {
breaks <- quantile(col, probs=seq(0, 1, length.out = 10))
findInterval(col, breaks)
})

Resources