Getting NA irrespective of the factor conversion method used - r

I see this question has been asked a lot of times and I myself have tried all the methods to convert my factors to numeric including:
as.numeric(as.character(x))
lapply(x, function(y) as.numeric(levels(y))[y])
as.numeric.factor <- function(y) as.numeric(levels(y))[y]
lapply(x, as.numeric.factor)
I cannot use the unlist function because I want the output in the wide format and unlist converts it to long.
But all of them are giving NA values as output.
When I specifically check x, it shows the values appropriately.
> class(x)
[1] "data.frame"
> class(x$Col2)
[1] "factor"
> class(x$Col1)
[1] "Date"
Please find a reproducible example:
set.seed(354)
df <- data.frame(Product_Id = rep(1:100, each = 50),
Date = seq(from = as.Date("2014/1/1"), to = as.Date("2018/2/1") , by = "month"),
Sales = rnorm(100, mean = 50, sd= 20))
df <- df[-c(251:256, 301:312, 2551:2562, 2651:2662, 2751:2762), ]
library(zoo)
z <- read.zoo(df, index = "Date", split = "Product_Id", FUN = as.yearmon)
tt <- as.ts(z)
fcast <- matrix(NA, ncol = ncol(tt), nrow = 12)
library(forecast)
for(i in 1:ncol(tt)){
fc1 <- forecast(stlf(na.trim(tt[,i]),h=12))
fcast[,i] <-fc1$mean
}
forecasted <- format(round(rbind(tt, fcast),2),nsmall = 2)
dd <- data.frame(date = seq(as.Date('2014-01-01'), by = 'months', length = nrow(forecasted)),Prod_Id = forecasted)
dd_f <- lapply(dd, function(x) as.numeric(levels(x))[x])
Any suggestion as to what can I be missing?

Related

Randomizing a distribution of data in a list

I have a data frame df that I would like to separate into a training set and a test set. Instead of getting only a single training and test set, I would like to get a distribution of them (n = 100).
I try and do this with lapply, but the values for each element in the list end up being exactly the same. How do I randomize the values in the two list (i.e., train.data and test.data)?
The expected output would be a list for both train.data and test.data, each containing 100 elements with different subsets of df in both of them.
library(lubridate)
library(tidyverse)
library(caret)
date <- rep_len(seq(dmy("01-01-2013"), dmy("31-12-2013"), by = "days"), 300)
ID <- rep(c("A","B","C"), 50)
class <- rep(c("N","M"), 50)
df <- data.frame(value = runif(length(date), min = 0.5, max = 25),
ID,
class)
training.samples <- df$class %>%
createDataPartition(p = 0.6, list = FALSE)
n <- 100
train.data <- lapply(1:n, function(x){
df[training.samples, ]
})
test.data <- lapply(1:n, function(x){
df[-training.samples, ]
})
Try using replicate
f1 <- function(dat, colnm) {
s1 <- createDataPartition(dat[[colnm]], p = 0.6,
list = FALSE)
return(list(train.data = dat[s1,], test.data = dat[-s1,]))
}
n <- 100
out <- replicate(n, f1(df, "class"), simplify = FALSE)

rowmean and standard deviation using data.table

x <- matrix(rnorm(500 * 10), nrow = 500, ncol = 10)
x[, 1] <- 1:500
x <- data.frame(x)
names(x) <- c('ID', 2000:2008)
library(data.table)
setDT(x)
I want to calculate mean, sd and no. of data points per row but I am getting error
x[, c("meanY",'sdY',"nY") := .(rowMeans(.SD, na.rm = TRUE), sd(.SD, na.rm = TRUE), rowSums(!is.na(.SD))), .SDcols=c(2:10)]
The issues lies in sd() which doesn't work row-wise.
x[,
c("meanY",'sdY',"nY") :=
.(rowMeans(.SD, na.rm = TRUE),
apply(.SD, 1, sd, na.rm = TRUE),
rowSums(!is.na(.SD))),
.SDcols = 2:10]
Assuming the output as a list, you can use following code to have a try:
op <- c("mean","sd","length")
r <- lapply(op, function(v) apply(x, 1, eval(parse(text = v))))
names(r) <- op
where it should work with your data.frame example:
x <- matrix(rnorm(500 * 10), nrow = 500, ncol = 10)
x[, 1] <- 1:500
x <- data.frame(x)
names(x) <- c('ID', 2000:2008)

Error making line plot using xts R : cannot mix zero-length and non-zero-length coordinates

I'm attempting to slice two data frames, one representing the date (Date) and one representing a continuous value and graph them using a xts line graph. When I use the full dateframes it works, but when I slice to the last 7 values, it throws the following error. Any thoughts?
Error in segments(xlim[1], y_grid_lines(get_ylim()[[2]]), xlim[2], y_grid_lines(get_ylim()[[2]]), :
cannot mix zero-length and non-zero-length coordinates
library(xts)
Date <- seq(as.Date("2018-01-01"), as.Date("2018-02-25"), by="days")
Date <- as.POSIXct(Date, format = "%Y-%m-%d %H:%M")
#Date <-tail #Works
Date <-tail(Date,7) #Doesn't Work
T1_EF <- matrix( rnorm(N*M,mean=23,sd=3), N, M)
Trial1_EF<- as.matrix(round(Trial1_EF, digits = 6))
T1_EF_counts <- apply(Trial1_EF, 2, function(x) length(na.omit(x)))
#c <-T1_EF_counts #Works
c <-tail(T1_EF_counts,7)#doesn't work
datN <- data.frame(Date = Date, y = c)
datN.ts <- xts(datN$y, order.by = datN$Date)
plot(datN.ts)
lines(datN.ts, col = 'Green')
I found a solution for this here:
https://github.com/joshuaulrich/xts/issues/156
xls doesn't like when the y value is a constant.
here is the edited code:
library(xts)
Date <- seq(as.Date("2018-01-01"), as.Date("2018-02-25"), by="days")
Date <- as.POSIXct(Date, format = "%Y-%m-%d %H:%M")
#Date <-tail #Works
Date <-tail(Date,7) #Doesn't Work
T1_EF <- matrix( rnorm(N*M,mean=23,sd=3), N, M)
Trial1_EF<- as.matrix(round(Trial1_EF, digits = 6))
T1_EF_counts <- apply(Trial1_EF, 2, function(x) length(na.omit(x)))
#c <-T1_EF_counts #Works
c <-tail(T1_EF_counts,7)#doesn't work
datN <- data.frame(Date = Date, y = c)
datN.ts <- xts(datN$y, order.by = datN$Date)
plot(datN.ts, , ylim=c(300, 600))
lines(datN.ts, col = 'Green')

Computing NTILE in R for RFM analysis

I'm trying to create a dataframe computing 10 percentiles based on the Recency, Frequency and Monetary. I have most of it set up, but I can't figure out why my code is returning three NTILES, when I'm asking for 10. I'm currently at a stand still. The next step will be calculating the percentage of customers in each ntile.
Here is my code:
rm(list = ls())
setwd("/Users/a76475/Documents/Customer_Analytics")
rfm<-read.csv("cdnow_students_transaction.csv")
#Set up test and calibration samples
rfm$DATE <- as.Date(rfm$DATE, format = "%m/%d/%y")
calib <- subset(rfm, rfm$DATE<"1997-09-29")
valid <- subset(rfm, rfm$DATE>"1997-09-30")
#Aggregate for frequency, monetary, and recency -- Calibration
recency<- aggregate(DATE ~ ID, data =calib, FUN = max)
colnames(recency) <- c("ID","Recency")
frequency <- aggregate(DOLLARS ~ ID, data =calib, FUN = length)
colnames(frequency) <- c("ID","Frequency")
monetary <- aggregate(DOLLARS ~ ID, data =calib, FUN = mean)
colnames(frequency) <- c("ID","Monetary")
calib <- merge(frequency, monetary, by = "ID")
calib <- merge(calib, recency, by = "ID")
#Aggregate for frequency, monetary, and recency -- Validation
recency<- aggregate(DATE ~ ID, data =valid, FUN = max)
colnames(recency) <- c("ID","Recency")
frequency <- aggregate(DOLLARS ~ ID, data =valid, FUN = length)
colnames(frequency) <- c("ID","Frequency")
monetary <- aggregate(DOLLARS ~ ID, data =valid, FUN = mean)
colnames(frequency) <- c("ID","Monetary")
valid <- merge(frequency, monetary, by = "ID")
valid <- merge(valid, recency, by = "ID")
colnames(valid) <- c("ID","FREQ","MONETARY","RECENCY")
colnames(calib) <- c("ID","FREQ","MONETARY","RECENCY")
calib$RECENCY <- NULL
#Create recency score
#For validation
for (i in 1:nrow(valid)) {
valid$RECENCY1[i] = as.numeric(max(valid$RECENCY) - valid$RECENCY[i])
}
valid$RECENCY <- valid$RECENCY1
valid$RECENCY1 <- NULL
#For calibration
for (i in 1:nrow(calib)) {
calib$RECENCY1[i] = as.numeric(max(valid$RECENCY) - calib$RECENCY[i])
}
calib$RECENCY <- calib$RECENCY1
calib$RECENCY1 <- NULL
#Merge datasets
rfm <- merge(calib,valid, by="ID", all.x = TRUE)
#Create Column for retention%
require(dplyr)
rfm$monetary.ntile <- ntile(rfm$MONETARY.y,10)
rfm$freq.ntile <- ntile(rfm$FREQ.y,10)
rfm$recency.ntile <- ntile(rfm$RECENCY,10)
For example, if you want 10 buckets for Recency, Frequency and Monetary Ratio:
set.seed(1)
n <- 100
df <- data.frame(
R = runif(n, 1, 365),
F = runif(n, 1, 5),
M = runif(n, 0, 100)
)
apply(df, 2, function(col) {
breaks <- quantile(col, probs=seq(0, 1, length.out = 10))
findInterval(col, breaks)
})

How to do calculations on elements from a sublist in R

my code is as follows:
x <- data.frame(matrix(rnorm(20), nrow=10))
colnames(x) <- c("z", "m")
n_boot<-4
bs <- list()
for (i in 1:n_boot) {
bs[[i]] <- x[sample(nrow(x), 10, replace = TRUE), ]
}
bt<-matrix(unlist(bs), ncol = 2*n_boot, byrow = FALSE)
colnames(bt) <- rep(c("z","m"),times=n_boot)
M_to_boot <- bt[,seq(2,8,by=2)]
funct<-function(M_boot_max) {
od<-(1/((10*((10^((16-M_boot_max-25)/5))^3)/3)*((max(M_boot_max)-min(M_boot_max))/50)))
}
V_boot<-apply(M_to_boot,2,funct)
rows.combined <- nrow(M_to_boot)
cols.combined <- ncol(M_to_boot) + ncol(V_boot)
matrix.combined <- matrix(NA, nrow=rows.combined, ncol=cols.combined)
matrix.combined[, seq(1, cols.combined, 2)] <- M_to_boot
matrix.combined[, seq(2, cols.combined, 2)] <- V_boot
colnames(matrix.combined) <- rep(c("M_boot","V_boot"),times=n_boot)
df<-as.data.frame(matrix.combined)
start0 <- seq(1, by = 2, length = ncol(df) / 2)
start <- lapply(start0, function(i, df) df[i:(i+1)], df = df)
tests<-lapply(start, function(xy) split(xy, cut(xy$M_boot,breaks=5)))
Now I want to prepare some calculations on values V_boot from a sublists. To be specific I want to for each subsample calculate the sum of V_boot. So, for example I want for a bin M_boot "[[4]]$(0.811,1.25]" to have a value of sum(V_boot) for that bin. But I cannot figure out how to get to that each V_boot values.
Please help me.

Resources