reducing the data cleaning with more efficient and faster functions - r

I have a huge df with 10 million observations and 50 variables as x. Currently I'm using "grepl", "str_replace" and "gsub" functions as follows for data cleaning which are very time consuming (each line 5 mins).
Is there any more efficient function or way to rewrite the codes to reduce run time, please?
x <-x[!grepl("A",x$ITEM_1, perl=TRUE,]
x <-x[!grepl("B",x$ITEM_1),perl=TRUE,]
x <-x[!grepl("C",x$ITEM_1),perl=TRUE,]
x <-x[!grepl("D",x$ITEM_1),perl=TRUE,]
x <-x[!grepl("E",x$ITEM_2),perl=TRUE,]
x <- x %>% mutate_at(vars(2:50), funs(gsub("\\?", "", .,perl=TRUE)))
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"#","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"#","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"~","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"\\(","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"\\)","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"&","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"\\\\","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"/","")
Regards,

The following shows comparative timings of the OP's code in the question and a simplification of that code.
It was tested with a dataframe of n = 10000 rows and 50 character column-vectors. The speedup is worthwhile.
library(dplyr)
library(stringr)
library(stringi)
library(microbenchmark)
fun.OP <- function(x){
x <- x[!grepl("A", x$ITEM_1, perl = TRUE), ]
x <- x[!grepl("B", x$ITEM_1, perl = TRUE), ]
x <- x[!grepl("C", x$ITEM_1, perl = TRUE), ]
x <- x[!grepl("D", x$ITEM_1, perl = TRUE), ]
x <- x[!grepl("E", x$ITEM_2, perl = TRUE), ]
x <- x %>% mutate_at(vars(2:ncol(x)), list(~gsub("\\?", "", .,perl=TRUE)))
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"#","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"#","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"~","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"\\(","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"\\)","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"&","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"\\\\","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"/","")
x
}
fun.Rui <- function(x){
x <- x[!grepl('[A-D]', x$ITEM_1, perl = TRUE), ]
x <- x[!grepl('E', x$ITEM_2, perl = TRUE), ]
x[2:ncol(x)] <- lapply(x[2:ncol(x)], function(y) stri_replace_all_fixed(y, '?', ''))
x$SUBNAMEZ <- stri_replace_all_regex(x$SUBNAMEZ, '#|#|~|\\(|\\)|&|/|', '')
x$SUBNAMEZ <- stri_replace_all_regex(x$SUBNAMEZ, '\\\\', '')
row.names(x) <- NULL
x
}
y1 <- fun.OP(x)
y2 <- fun.Rui(x)
dim(y1)
dim(y2)
identical(y1, y2)
mb <- microbenchmark(
OP = fun.OP(x),
Rui = fun.Rui(x)
)
print(mb, order = 'median')
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# Rui 17.05596 17.21667 21.41270 17.30466 17.44592 62.58906 100 a
# OP 42.88685 43.25211 54.68897 43.53331 43.98865 501.98495 100 b
Data creation code.
makeString <- function(col, N){
y <- character(N)
if(col == 1){
L <- LETTERS
}else if(col == 2){
L <- c(LETTERS, '?')
} else{
L <- c(LETTERS, '#', '#', '~', '(', ')', '\\', '/')
}
for(i in seq_len(N)){
y[i] <- paste(sample(L, sample(50, 1), TRUE), collapse = '')
}
y
}
set.seed(1234)
n <- 1e4
x <- lapply(1:50, function(i) makeString(i, n))
names(x) <- sprintf("V%02d", seq_along(x))
x <- do.call(cbind.data.frame, x)
names(x)[1:3] <- c('ITEM_1', 'ITEM_2', 'SUBNAMEZ')

Related

How to simplifying this R Code to detect repeated sequence

I found this below function to detect repeated sequence. I integrate the function into Monte Carlo Simulation to calculate the probability. The function I have is too long and takes too much time during the simulation. I would appreciate if anyone can help to simply the function and in turn fasten any simulation depends on it.
V1 <- c(68,71,72,69,80,78,80,81,84,82,67,73,65,68,66,70,69,72,74,73,68,75,70,72,75,73,69,75,74,79,80,78,80,81,79,82,69,73,67,66,70,72,69,72,75,80,68,69,71,77,70,73)
Check_repeat_Seq <- function(vector){
k <- 2:25
Lall <- setNames(lapply(k, function(i) table(zoo::rollapply(vector, width = i, toString))), k)
L <- Filter(length, lapply(Lall, function(x) x[x == max(x) & x > 1]))
dat <- data.frame(seq_rep=sapply(L, length))
dat$repeat_length <- as.numeric(rownames(dat))
return(max(dat$repeat_length))
}
Check_repeat_Seq(V1)
#### Can you please simplify the following also to calculate the sum of repeated.####
Check_repeat_Seq_no_overlap_sum <- function(vector){
k <- 2:25
Lall <- setNames(lapply(k, function(i) table(zoo::rollapply(vector, width = i, toString))), k)
L <- Filter(length, lapply(Lall, function(x) x[x == max(x) & x > 1]))
vec <- unlist(unname(L))
nms <- names(vec)
is_le <- function(i) any(grepl(nms[i], tail(nms, -i)) & (vec[i] <= tail(vec, -i)))
LL <- vec[ ! sapply(seq_along(nms), is_le) ]
dat <- data.frame(seq_rep=sapply(L, length))
dat$repeat_length <- as.numeric(rownames(dat))
dat$total_repeat <- dat$seq_rep*dat$repeat_length
return(sum(dat$total_repeat))
}
##### the original function should return data Frame as follows
Check_All_repeat_Seq<- function(vector){
k <- 2:25
Lall <- setNames(lapply(k, function(i) table(zoo::rollapply(vector, width = i, toString))), k)
L <- Filter(length, lapply(Lall, function(x) x[x == max(x) & x > 1]))
vec <- unlist(unname(L))
nms <- names(vec)
is_le <- function(i) any(grepl(nms[i], tail(nms, -i)) & (vec[i] <= tail(vec, -i)))
LL <- vec[ ! sapply(seq_along(nms), is_le) ]
dat <- data.frame(seq_rep=sapply(L, length))
dat$repeat_length <- as.numeric(rownames(dat))
dat$total_repeat <- dat$seq_rep*dat$repeat_length
return(sum(dat))
}
please help simplifying the code with the same output
Update
An even faster iterative approach leveraging the Cantor pairing function:
allDup <- function(x) {
duplicated(x) | duplicated(x, fromLast = TRUE)
}
fPair <- function(i, j) {
# Cantor pairing function
k <- j + (i + j)*(i + j + 1L)/2L
match(k, unique(k))
}
Check_repeat_Seq3 <- function(v) {
v <- match(v, unique(v))
vPair <- fPair(head(v, -1), tail(v, -1))
blnKeep <- allDup(vPair)
idx <- which(blnKeep)
len <- 1L
while (length(idx)) {
len <- len + 1L
vPair <- fPair(vPair[blnKeep], v[idx + len])
blnKeep <- allDup(vPair)
idx <- idx[blnKeep]
}
return(len)
}
# benchmark against the rollaply solution
V1 <- c(68,71,72,69,80,78,80,81,84,82,67,73,65,68,66,70,69,72,74,73,68,75,70,72,75,73,69,75,74,79,80,78,80,81,79,82,69,73,67,66,70,72,69,72,75,80,68,69,71,77,70,73)
Check_repeat_Seq <- function(vector){
k <- 2:25
Lall <- setNames(lapply(k, function(i) table(zoo::rollapply(vector, width = i, toString))), k)
L <- Filter(length, lapply(Lall, function(x) x[x == max(x) & x > 1]))
dat <- data.frame(seq_rep=sapply(L, length))
dat$repeat_length <- as.numeric(rownames(dat))
return(max(dat$repeat_length))
}
Check_repeat_Seq(V1)
#> [1] 4
Check_repeat_Seq3(V1)
#> [1] 4
microbenchmark::microbenchmark(Check_repeat_Seq(V1), Check_repeat_Seq3(V1))
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> Check_repeat_Seq(V1) 38445.7 40860.95 43153.058 42249.25 44051.15 60593.2 100
#> Check_repeat_Seq3(V1) 103.9 118.65 150.713 149.05 160.05 465.2 100
Original Solution
Check_repeat_Seq2 <- function(v) {
m <- matrix(c(head(v, -1), tail(v, -1)), ncol = 2)
idx <- which(duplicated(m) | duplicated(m, fromLast = TRUE))
len <- 2L
while (length(idx)) {
len <- len + 1L
m <- matrix(v[sequence(rep(len, length(idx)), idx)], ncol = len, byrow = TRUE)
idx <- idx[duplicated(m) | duplicated(m, fromLast = TRUE)]
}
return(len - 1L)
}
UPDATE 2
This should return your dat data.frame:
Check_repeat_Seq3 <- function(v) {
v1 <- match(v, unique(v))
vPair <- fPair(head(v1, -1), tail(v1, -1))
blnKeep <- allDup(vPair)
idx <- which(blnKeep)
if (length(idx)) {
len <- 1L
seq_rep <- integer(length(v)/2)
while (length(idx)) {
len <- len + 1L
vPair <- fPair(vPair[blnKeep], v1[idx + len])
blnKeep <- allDup(vPair)
seq_rep[len] <- nrow(unique(matrix(v[sequence(rep(len, length(blnKeep)), idx)], ncol = len, byrow = TRUE)))
idx <- idx[blnKeep]
}
len <- 2:len
return(data.frame(seq_rep = seq_rep[len], repeat_length = len, total_repeat = seq_rep[len]*len))
} else {
return(data.frame(seq_rep = integer(0), repeat_length = integer(0), total_repeat = integer(0)))
}
}

Sample function in R vector combining

I need your help, I need to combine two vectors(z and Num1 or Num2), so z will 10 in final vector and Num1(Num2) was 90 in final vector.
Code that I have now:
I <- seq(1:100)
NA1<-vector()
NA2<-vector()
z <- rep(NA, 10)
Num1 <- rnorm(100)
Num2 <- rnorm(100)
vect_1 <- sample(c(Num1, z))
vect_2 <- sample(c(Num2, z))
vect_1_NA <- is.na(vect_1)
vect_2_NA <- is.na(vect_2)
for(i in I){
if(vect_1_NA[i] == TRUE)
NA1 <- append(NA1, i)
}
for(i in I){
if(vect_2_NA[i] == TRUE)
NA2 <- append(NA2, i)
}

Pre-allocation and optimization loop

My R script have the form:
for (j in 1:N) {
#construct the DF2 data frame
#operations on the DF2 data frame
}
Where N can be large (like a 1 mln). The columns of DF2 are defined
one after the other with the formula:
DF2$column_i <- function(x,f..) #or constant or ....
DF$column_i can are a constant, a function or a loop "while". I tried to pre allocate defining DF2 before with:
DF2 <- data.frame(matrix(nrow=..,ncol=..))
and computing after the columns DF2$column_i, but I have not had any benefits.
Does anyone have any ideas?
My code is of the type:
par <- data.frame(CA=runif(n = 50, min = 70000, max = 100000),
D=round(runif(n = 50, min = 70, max = 90),0),
P=runif(n = 50, min = 900, max = 20000),
A=round(runif(n = 50, min = 50, max = 70),0))
parpa <- data.frame(matrix(nrow = nrow(par), ncol = 3*V))
comp <- function(CA, D, P, A){
vect <- rep('numeric', 3*V)
b <- 1
k <- 1
while (((b+1) <= (D+1))&(k < V)) {
a <- b+1
b <- min((a+8-1), (D+1))
vect[c(1+4*k, 2+4*k, 3+4*k, 4+4*k)] <- c(mean(DF2$Z[a:b]), sum(DF2$X[a:b]),
mean(DF2$Q[a:b]), sum(DF2$AE[a:b]))
k <- k+1
}
return(vect)
}
#loop
for (j in 1:nrow(par)) {
CA <- par$CA[j]
D <- par$D[j]
R <- 0.01*D
P <- par$P[j]
A <- par$A[j]
COST <- 500
V <- 5
#DF2
DF2 <- data.frame(M=0:D)
OB <- function(x) {
c <- COST*D*DF2$M/R
return(c)
}
DF2$O <- O(D)
DF2$E <- (D*DF2$M+2)/D*(D+4)
DF2$Q <- (CA-DF2$M)*D
DF2$X <- (CA-DF2$O)*(DF2$E+P)
Func <- function(x) {return(round(x/30, 2))}
DF2$Z[(A+2):(D+1)] <- sapply(DF2$E[(A+2):(D+1)], Func)
parpa[j,] <- comp(CA, D, P, A)
}

Sum value by Combine all variable using R

Can somebody help me with data manipulation using R? i have data (data.train) like this
datex <- rep(c(rep("01/01/17",6),rep("02/01/17",6),rep("03/01/17",6)),1)
datex <- as.Date(datex, "%d/%m/%y")
Ax <- rep("A1",18)
Bx <- rep(c(rep("B1",3),rep("B2",3)),3)
Cx <- rep(c("C1","C2","C3"),6)
valx <- 100
for(i in 1:17){valx[i+1] <- valx[i]+1}
data.train <- data.frame(datex, Ax, Bx, Cx, valx)
i need all combination from variable and the final form is like this
I have tried this code:
### Library
library(dplyr)
## datex
datex <- rep(c(rep("01/01/17",6),rep("02/01/17",6),rep("03/01/17",6)),1)
datex <- as.Date(datex, "%d/%m/%y")
Ax <- rep("A1",18)
Bx <- rep(c(rep("B1",3),rep("B2",3)),3)
Cx <- rep(c("C1","C2","C3"),6)
valx <- 100
for(i in 1:17){valx[i+1] <- valx[i]+1}
data.train <- data.frame(datex, Ax, Bx, Cx, valx)
names.group <- names(data.train)[1:length(data.train)-1]
data.group <- Map(combn, list(names.group), seq_along(names.group), simplify = F) %>% unlist(recursive = F)
find.index <- sapply(data.group, function(x, find.y){
any(find.y %in% x)
}, find.y = c("datex"))
index.group <- NULL
for(i in 2:length(find.index)){
if(find.index[i] == "TRUE"){
index.group[i] <- i
}
}
index.group[is.na(index.group)] <- 0
for(i in 1:length(data.group)){
if(index.group[i] == 0){
data.group[[i]] <- 0
} else {
data.group[[i]] <- data.group[[i]]
}
}
data.group2 <- data.group[sapply(data.group, function(x) any(x != 0))]
combination.result <- lapply(data.group2, FUN = function(x) {
do.call(what = group_by_, args = c(list(data.train), x)) %>% summarise(sumVar = sum(valx))
})
combination.result
but i don't produce what i want. Thanks
You can generate for combinations of length 1 then for combinations of length 2. Use paste to create your Variable column. Then rbindlist all your results to get the final output.
library(data.table)
setDT(data.train)
sumCombi <- function(x, mySep="_") {
data.train[ , sum(Val), by=c("Date", x)][,
list(Date,
Variable=do.call(paste, c(.SD[,x,with=FALSE], list(sep=mySep))),
SumVal=V1)]
}
rbindlist(c(
#combinations with 1 element in each combi
lapply(c("A", "B", "C"), sumCombi)
,
#combinations with 2 elements in each combi
lapply(combn(c("A","B","C"), 2, simplify=FALSE), sumCombi)
), use.names=FALSE)
or more generically/programmatically:
#assuming that your columns are in the middle of the columns while excl. first and last columns
myCols <- names(data.train)[-c(1, ncol(data.train))]
rbindlist(unlist(
lapply(seq_along(myCols), function(n)
combn(myCols, n, sumCombi, simplify=FALSE)
), recursive=FALSE),
use.names=FALSE)

data.table operations with %dopar% are very slow

I run a loop over elements of list grouped_data_list using foreach and dopar.
The runtime is terribly slow, while workers are visibly busy.
If I make a vectorized routine with lapply, and without parallelling, this takes seconds. What is wrong with my dopar?
library(data.table)
library('doParallel') # parallel cpu implementation
library('foreach') # parallel looping
grouped_data_dt <- data.table(
Who=c("thdeg","mjg","dfdf","system","df","system","system","hegha","ydvw")
, DocumentExtension=c("jpg","com","dug","182","27","pdf","png","xslt","53")
, What_Action=c("added","removed","added","added","added","removed","added","added","added")
, Date=as.Date(c("2017-11-08","2017-10-10","2017-09-14","2017-09-20","2017-09-21","2017-10-20","2017-10-19","2017-08-24","2017-09-17"))
, Count=c(1,2,3,4,5,6,7,8,9)
)
reported_date_seq_dt <- data.table(
reported_date_seq = as.Date(c(
"2017-08-23","2017-08-24","2017-08-25","2017-08-26","2017-08-27","2017-08-28","2017-08-29","2017-08-30","2017-08-31","2017-09-01","2017-09-02"
,"2017-09-03","2017-09-04","2017-09-05","2017-09-06","2017-09-07","2017-09-08","2017-09-09","2017-09-10","2017-09-11","2017-09-12","2017-09-13"
,"2017-09-14","2017-09-15","2017-09-16","2017-09-17","2017-09-18","2017-09-19","2017-09-20","2017-09-21","2017-09-22","2017-09-23","2017-09-24"
,"2017-09-25","2017-09-26","2017-09-27","2017-09-28","2017-09-29","2017-09-30","2017-10-01","2017-10-02","2017-10-03","2017-10-04","2017-10-05"
,"2017-10-06","2017-10-07","2017-10-08","2017-10-09","2017-10-10","2017-10-11","2017-10-12","2017-10-13","2017-10-14","2017-10-15","2017-10-16"
,"2017-10-17","2017-10-18","2017-10-19","2017-10-20","2017-10-21","2017-10-22","2017-10-23","2017-10-24","2017-10-25","2017-10-26","2017-10-27"
,"2017-10-28","2017-10-29","2017-10-30","2017-10-31","2017-11-01","2017-11-02","2017-11-03","2017-11-04","2017-11-05","2017-11-06","2017-11-07"
,"2017-11-08","2017-11-09","2017-11-10","2017-11-11","2017-11-12","2017-11-13","2017-11-14","2017-11-15","2017-11-16","2017-11-17","2017-11-18"
,"2017-11-19","2017-11-20","2017-11-21","2017-11-22","2017-11-23","2017-11-24","2017-11-25","2017-11-26","2017-11-27"
))
)
grouped_data_list <-
split(x = grouped_data_dt
, drop = T
, by = c("Who", "DocumentExtension", "What_Action")
, sorted = T
, keep.by = T
)
cl <- makeCluster(4)
registerDoParallel(cl)
## replace NA with zeros in the timeseries
grouped_data_list_2 <- list()
foreach(
i = 1:length(grouped_data_list)
) %dopar%
{
x <- grouped_data_list[[i]]
data.table::setkey(x, Date)
dt_params <- unlist(
x[1, -c('Date', 'Count'), with = F]
)
y <- x[reported_date_seq_dt]
y[is.na(Count), (colnames(y)[!colnames(y) %in% c('Date', 'Count')]) := lapply(1:length(dt_params), function(x) dt_params[x])]
y[is.na(Count), Count := 0]
grouped_data_list_2 <- c(grouped_data_list_2
, list(y)
)
}
stopCluster(cl)
lapply routine:
## after grouped_data_list is created
rm(group_replace_func)
group_replace_func <- function(x)
{
setkey(x, Date)
dt_params <- unlist(
x[1, -c('Date', 'Count'), with = F]
)
y <- x[reported_date_seq_dt]
y[is.na(Count), (colnames(y)[!colnames(y) %in% c('Date', 'Count')]) := lapply(1:length(dt_params), function(x) dt_params[x])]
y[is.na(Count), Count := 0]
return(y)
}
grouped_data_list_2 <- lapply(
grouped_data_list
, group_replace_func
)
A new version that works fast (#Roland's advice):
## parallel work
cl <- makeCluster(4)
registerDoParallel(cl)
## replace NA with zeros in the timeseries
grouped_data_list_2 <- list()
grouped_data_list_2 <- foreach(
x = grouped_data_list
) %dopar%
{
data.table::setkey(x, Date)
dt_params <- unlist(
x[1, -c('Date', 'Count'), with = F]
)
y <- x[reported_date_seq_dt]
y[is.na(Count), (colnames(y)[!colnames(y) %in% c('Date', 'Count')]) := lapply(1:length(dt_params), function(x) dt_params[x])]
y[is.na(Count), Count := 0]
y
}
stopCluster(cl)

Resources