optimize call function for each row of data table - r

We need to optimize a program will perform a treatment on each "nb_student"
depending on the value number, generate a list number corresponding to the value entered.
Once this list is obtained, another program will have to count according to a ranking rule.
ranking rule
if number of student :
is less than 1 => increment group A
is between 1 and 3 => increment group B
is between 3 and 4 => increment group C
is greater than 4 => increment group D
initial data
"category_name" "nb_student"
A 6,00000
A 10,00000
B 12,0000
C 74,0000
D 6,00000
init data code
DT = data.table(
category_name = c("A","B","C","D"),
nb_student = c(6,12,74,6)
)
function for each row
treatment_group <- function(nb_student){
nb_group_A = nb_group_B = nb_groupe_C = nb_groupe_D <- 0
limit_1 <- 1
limit_2 <- 3
limit_3 <- 4
list <- runif(nb_student, 0, 5)
for (i in list) {
if(i < limit_1){
nb_group_A <- nb_group_A + 1
}else if(i > limit_1 & i < limit_2){
nb_group_B <- nb_group_B + 1
}else if(i > limit_3){
nb_groupe_C <- nb_groupe_C + 1
}else {
nb_groupe_D <- nb_groupe_D + 1
}
}
list(nb_group_A, nb_group_B, nb_groupe_C, nb_groupe_D)
}
result
DT[ , c("group A", "group B", "group C", "group D") := tratment_group(nb_student), by = seq_len(nrow(DT))]
The final result must match this table
"category_name" "nb_student" "group A" "group B" "group C" "group D"
A 6,00000 0,00000 2,00000 4,00000 0,00000
A 10,00000 3,00000 3,00000 4,00000 0,00000
B 12,0000 2,00000 9,00000 0,00000 1,00000
C 74,0000 14,0000 29,0000 15,0000 16,0000
D 6,00000 0,00000 1,00000 3,00000 2,00000
this code works, but i want to optimize it to run with 200000 rows. Maybe using parallelization ?

I guess you can try findInterval
set.seed(1)
DT[
,
c(
.SD,
as.data.frame(
t(as.matrix(table(
factor(
findInterval(runif(nb_student, 0, 5), c(1, 3, 4)) + 1,
levels = 1:4,
label = paste("group", LETTERS[1:4])
)
)))
)
),
category_name
]
which gives
category_name nb_student group A group B group C group D
1: A 6 0 4 0 2
2: B 12 2 3 5 2
3: C 74 11 35 17 11
4: D 6 0 2 3 1

Related

Loop within a loop with column names in R

I have the following data:
id A B C
1 1 1 0
2 1 1 1
3 0 1 1
I will like to create a function that computes the following three information between columns:
the number of individuals i) with A and B, ii) with A but not B, iii) B but not A. Similarly, I will like a recursive loop that computes these three numbers for A and C, and B and C. Is there a smart way to do so? a loop within a loop? So far, I have tried the following:
for(ii in colnames(df)){
for(jj in (ii+1):df){
print(ii,jj)
}}
Perhaps something like this:
# function to return your metrics
foo = function(x, y) {
c(
"x and y" = sum(x & y),
"x not y" = sum(x & !y),
"y not x" = sum(!x & y)
)
}
# generate combinations of columns
col_combos = combn(names(df)[-1], 2)
result = apply(col_combos, 2, function(x) foo(df[[x[1]]], df[[x[2]]]))
colnames(result) = apply(col_combos, 2, toString)
result
# A, B A, C B, C
# x and y 2 1 2
# x not y 0 1 1
# y not x 1 1 0
Using this data:
df = read.table(text = 'id A B C
1 1 1 0
2 1 1 1
3 0 1 1 ', header = TRUE)

I have a sample dataset , which has missing values in it

I have a sample dataset , which has missing values in it.I want to create a new column with a message of different combinations where it should tell which columns values are missing.
Example:
Dataset:
A B C D
1 2 4
4 4
4 1
3 2 3
The permutaions of the above data set is :
"a" ,"b","c","d" ,"a, b","a, c" ,"a, d" , "b, c","b, d","c, d" , "a, b, c","a, b, d","a, c, d","b, c, d","a, b, c, d"
Result:
A B C D Message
1 2 4 Column B is missing
4 4 column A and D is Missing
4 1 Column C and D is Missing
All column values are missing
3 2 3 Column B is Missing
Any suggestion would be really appreciated
Here's a way using apply from base R -
set.seed(4)
df <- data.frame(matrix(sample(c(1:5, NA), 15, replace = T), ncol = 3))
names(df) <- LETTERS[1:3]
df$msg <- apply(df, 1, function(x) {
if(anyNA(x)) {
paste0(paste0(names(x)[which(is.na(x))], collapse = " "), " missing", collapse = "")
} else {
"No missing"
}
})
df
A B C msg
1 4 2 5 No missing
2 1 5 2 No missing
3 2 NA 1 B missing
4 2 NA NA B C missing
5 5 1 3 No missing

R: reshape data frame when one column has unequal number of entries

I have a data frame x with 2 character columns:
x <- data.frame(a = numeric(), b = I(list()))
x[1:3,"a"] = 1:3
x[[1, "b"]] <- "a, b, c"
x[[2, "b"]] <- "d, e"
x[[3, "b"]] <- "f"
x$a = as.character(x$a)
x$b = as.character(x$b)
x
str(x)
The entries in column b are comma-separated strings of characters.
I need to produce this data frame:
1 a
1 b
1 c
2 d
2 e
3 f
I know how to do it when I loop row by row. But is it possible to do without looping?
Thank you!
Have you checked out require(splitstackshape)?
> cSplit(x, "b", ",", direction = "long")
a b
1: 1 a
2: 1 b
3: 1 c
4: 2 d
5: 2 e
6: 3 f
> s <- strsplit(as.character(x$b), ',')
> data.frame(value=rep(x$a, sapply(s, FUN=length)),b=unlist(s))
value b
1 1 a
2 1 b
3 1 c
4 2 d
5 2 e
6 3 f
there you go, should be very fast:
library(data.table)
x <- data.table(x)
x[ ,strsplit(b, ","), by = a]

Take certain value in a data frame

I have a data.frame and would like to take a certain value from a cell if another is in a dataframe.
I tried the apply function.
n <- c(2, 3, 0 ,1)
s <- c(0, 1, 1, 2)
b <- c("THIS", "FALSE", "NOT", "THIS")
df <- data.frame(n, s, b)
df <- sapply(df$Vals, FUN=function(x){ if(b[x]=="THIS") ? n[x] : s[x] } )
My logic is:
if(b at position x is equal to "This") {
add n[x] to the column df$Vals
} else {
add s[x] to the column df$Vals
}
Whereas x is a single row.
Any recommendation what I am doing wrong?
I appreciate your reply!
Like this:
df$Vals = with(df, ifelse(b=="THIS", n, s))
Or giving direct the resulting data.frame:
transform(df, Vals=with(df, ifelse(b=="THIS", n, s)))
# n s b Vals
#1 2 0 THIS 2
#2 3 1 FALSE 1
#3 0 1 NOT 1
#4 1 2 THIS 1
With your additional conditions:
func=Vectorize(function(b, s, n){if(b=='THIS') return(n);if(b==F) return(n+s);s})
df$Vals = with(df, func(b,s,n))
Or you could use the row/column indexing
df$Vals <- df[1:2][cbind(1:nrow(df),(df$b!='THIS')+1)]
df
# n s b Vals
#1 2 0 THIS 2
#2 3 1 FALSE 1
#3 0 1 NOT 1
#4 1 2 THIS 1

How can I build an inverted index from a data frame in R?

Say I have a data frame in R : data.frame(x=1:4, y=c("a b c", "b", "a c", "c"))
x y
1 1 a b c
2 2 b
3 3 a c
4 4 c
Now I want to build a new data frame, an inverted index which is quite common in IR or recommendation systems, from it:
y x
a 1 3
b 1 2
c 1 3 4
How can I do this in an efficient way?
conv <- function(x) {
l <- function(z) {
paste(x$x[grep(z, x$y)], collapse=' ')
}
lv <- Vectorize(l)
alphabet <- unique(unlist(strsplit(as.character(x$y), ' '))) # hard-coding this might be preferred for some uses.
y <- lv(alphabet)
data.frame(y=names(y), x=y)
}
x <- data.frame(x=1:4, y=c("a b c", "b", "a c", "c"))
> conv(x)
## y x
## a a 1 3
## b b 1 2
## c c 1 3 4
An attempt, after converting y to characters:
test <- data.frame(x=1:4,y=c("a b c","b","a c","c"),stringsAsFactors=FALSE)
result <- strsplit(test$y," ")
result2 <- sapply(unique(unlist(result)),function(y) sapply(result,function(x) y %in% x))
result3 <- apply(result2,2,function(x) test$x[which(x)])
final <- data.frame(x=names(result3),y=sapply(result3,paste,collapse=" "))
> final
x y
a a 1 3
b b 1 2
c c 1 3 4
quick and dirty
original.df <- data.frame(x=1:4, y=c("a b c", "b", "a c", "c"))
original.df$y <- as.character(original.df$y)
y.split <- strsplit(original.df$y, " ")
y.unlisted <- unique(unlist(y.split))
new.df <-
sapply(y.unlisted, function(element)
paste(which(sapply(y.split, function(y.row) element %in% y.row)), collapse=" " ))
as.data.frame(new.df)
> new.df
a 1 3
b 1 2
c 1 3 4

Resources