Return dataframe modified by function in R

I made a function which searches for outliers in each row of a data frame. What I'd like to get at the end is the modified data frame with a new column x$outliers_numb returned by the function, not just printed. I added return() at the end but it doesn't seem to work at all. Any ideas?
library(dplyr)  # needed for %>% and mutate()

outliers <- function(x, s, e){
  # x = dataframe
  # s = index of first column to take
  # e = index of last column to take
  p <- x
  for (i in s:e){
    Q1  <- quantile(p[, i], 0.25, names = FALSE)
    Q3  <- quantile(p[, i], 0.75, names = FALSE)
    iqr <- IQR(p[, i])
    low <- Q1 - iqr * 1.5
    up  <- Q3 + iqr * 1.5
    p[, i] <- ((p[, i] < low) | (p[, i] > up))  # TRUE where the value is an outlier
  }
  p <- p %>% mutate(outliers_numb = rowSums(p[, s:e]))
  x$outliers_numb <- p$outliers_numb
  return(x)
}
# example
w <- data.frame(col1 = c(1, 2, 3, 4, 5, 90, 6),
                col2 = c(13, 60, 13, 18, 13, 12, 0),
                col3 = c(1, 899, 5, 4, 3, 8, 6))
outliers(w, 1, 3)

return() is working; calling outliers(w, 1, 3) at the console simply prints the returned data frame without storing it. Just assign the result to a variable:
dataframe_to_reus <- outliers(w, 1, 3)
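As a side note, the same IQR-based flagging can be written without the explicit loop. A minimal sketch, assuming the goal is only to count outliers per row (flag_outliers is a made-up name, not part of the question):

flag_outliers <- function(x, s, e) {
  flags <- sapply(x[, s:e], function(col) {
    q <- quantile(col, c(0.25, 0.75), names = FALSE)
    iqr <- q[2] - q[1]
    col < q[1] - 1.5 * iqr | col > q[2] + 1.5 * iqr   # TRUE where the value is an outlier
  })
  x$outliers_numb <- rowSums(flags)
  x
}

w2 <- flag_outliers(w, 1, 3)   # again, assign the result to keep it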

Related

How to convert a for-loop to lapply function for parallel testing purposes?

I've been studying the advantages and disadvantages of for-loops versus the apply() family of functions, and the answer isn't clear-cut (apply() being always faster than a for-loop may not be true, depending on circumstances). So I want to test the various options against my actual data.
Below is a for-loop which looks pretty straightforward to me, but I'm unsure how to replace it with lapply(). I assume lapply() is correct since the for-loop produces a list object.
The actual data I need to run this analysis against is a data frame containing 2.5 million rows and 30+ columns, so I'd like to run speed tests on the various options.
Any explanation would be most helpful. The examples I found online are either light on explanation or overly complex, and I hope to learn to use the apply() family of functions well, as they seem very useful and simpler to read than for-loops.
Here's the simplified for-loop code, with example data frame, which runs correctly for example purposes:
# Set up data frame to perform migration analysis on:
data <- data.frame(
  ID = c(1, 1, 1, 2, 2, 2, 3, 3, 3),
  Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
  Values = c(5, 10, 15, 0, 2, 4, 3, 6, 9),
  Flags = c("X0", "X1", "X2", "X0", "X2", "X0", "X2", "X1", "X0")
)

# Function to set up the base table:
setTable <- function(data){
  df <- data.frame(matrix(NA, ncol = length(unique(data$Flags)), nrow = length(unique(data$Flags))))
  row.names(df) <- unique(data$Flags)
  names(df) <- unique(data$Flags)
  return(df)
}

# Function to complete the migration table with a for-loop:
migration <- function(data, from = 1, to = 3){
  df <- setTable(data)
  for (i in unique(data$ID)){
    id_from <- as.character(data$Flags[data$ID == i & data$Period == from])
    id_to <- as.character(data$Flags[data$ID == i & data$Period == to])
    column <- which(names(df) == id_from)
    row <- which(row.names(df) == id_to)
    df[row, column] <- ifelse(is.na(df[row, column]), 1, df[row, column] + 1)
  }
  return(df)
}
# Now to run the function:
test1 <- migration(data, from=1, to=3)
Edit: wrapped in a function that lets you specify from and to:
library(data.table)

DF <- data.frame(
  ID = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4),
  Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3),
  Values = c(5, 10, 15, 0, 2, 4, 3, 6, 9, 3, 6, 9),
  Flags = c("X0", "X1", "X2", "X0", "X2", "X0", "X2", "X1", "X0", "X2", "X1", "X0")
)

migration <- function(DT, from = 1, to = 3){
  setDT(DT)
  unique_flags <- unique(DT$Flags)
  all_flags <- setDT(expand.grid(list(from_flag = unique_flags, to_flag = unique_flags)))
  dcast(DT[, .(from_flag = Flags[Period == from], to_flag = Flags[Period == to]), by = ID][
    , .N, c("from_flag", "to_flag")][
    all_flags, on = c("from_flag", "to_flag")], to_flag ~ from_flag, value.var = "N")
}

migration(DF, 1, 3)
When it comes to speed in R, you can almost always count on library(data.table):
library(data.table)

DT <- setDT(data.frame(
  ID = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4),
  Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3),
  Values = c(5, 10, 15, 0, 2, 4, 3, 6, 9, 3, 6, 9),
  Flags = c("X0", "X1", "X2", "X0", "X2", "X0", "X2", "X1", "X0", "X2", "X1", "X0")
))

unique_flags <- unique(DT$Flags)
all_flags <- setDT(expand.grid(list(first_flag = unique_flags, last_flag = unique_flags)))

resultDT <- dcast(DT[, .(first_flag = first(Flags), last_flag = last(Flags)), by = ID][
  , .N, c("first_flag", "last_flag")][
  all_flags, on = c("first_flag", "last_flag")], last_flag ~ first_flag, value.var = "N")
print(resultDT)
Step by step:
library(data.table)

DT <- setDT(data.frame(
  ID = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4),
  Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3),
  Values = c(5, 10, 15, 0, 2, 4, 3, 6, 9, 3, 6, 9),
  Flags = c("X0", "X1", "X2", "X0", "X2", "X0", "X2", "X1", "X0", "X2", "X1", "X0")
))

unique_flags <- unique(DT$Flags)
all_flags <- setDT(expand.grid(list(first_flag = unique_flags, last_flag = unique_flags)))

resultDT <- DT[, .(first_flag = first(Flags), last_flag = last(Flags)), by = ID]  # find relevant flags
resultDT <- resultDT[, .N, c("first_flag", "last_flag")]                          # count transitions
resultDT <- resultDT[all_flags, on = c("first_flag", "last_flag")]                # merge in all combinations
resultDT <- dcast(resultDT, last_flag ~ first_flag, value.var = "N")              # dcast to wide format
print(resultDT)
Regarding lapply, you can do the following (I'd still prefer data.table):
# Set up data frame to perform migration analysis on:
input_data <- data.frame(
  ID = c(1, 1, 1, 2, 2, 2, 3, 3, 3),
  Period = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
  Values = c(5, 10, 15, 0, 2, 4, 3, 6, 9),
  Flags = c("X0", "X1", "X2", "X0", "X2", "X0", "X2", "X1", "X0")
)

# Function to set up the base table:
setTable <- function(data){
  DF <- data.frame(matrix(NA, ncol = length(unique(data$Flags)), nrow = length(unique(data$Flags))))
  row.names(DF) <- unique(data$Flags)
  names(DF) <- unique(data$Flags)
  return(DF)
}

# Function to complete the migration table with lapply:
migration <- function(data, from = 1, to = 3){
  DF <- setTable(data)
  # note: seq_along(unique(data$ID)) only matches the IDs here because they happen to be 1, 2, 3
  lapply(seq_along(unique(data$ID)), function(i){
    id_from <- as.character(data$Flags[data$ID == i & data$Period == from])
    id_to <- as.character(data$Flags[data$ID == i & data$Period == to])
    column <- which(names(DF) == id_from)
    row <- which(row.names(DF) == id_to)
    # <<- assigns into DF in the enclosing function environment
    DF[row, column] <<- ifelse(is.na(DF[row, column]), 1, DF[row, column] + 1)
  })
  return(DF)
}

# Now to run the function:
test1 <- migration(input_data, from = 1, to = 3)
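To actually time the competing versions on real data, microbenchmark is one option. A rough sketch, assuming the microbenchmark package is installed and the for-loop and data.table versions have been given distinct names (migration_loop and migration_dt are hypothetical names; in the answer above they all reuse the name migration):

library(microbenchmark)

# migration_loop = the for-loop version above, migration_dt = the data.table version,
# renamed so both can exist at once
microbenchmark(
  for_loop   = migration_loop(DF, from = 1, to = 3),
  data_table = migration_dt(DF, from = 1, to = 3),
  times = 100L
)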

Search a vector in a specific element of a nested list in R

To explain what I want to do exactly, I will use the following example:
a = list(x = 5, y = c(11, 12, 13))
b = list(x = 4.7, y = c(112, 5, 2))
c = list(x = 77, y = c(5, 1, 1))
d = list(x = 5, y = c(22, 11, 43))
test_list = list(a, b, c, d)
I have a nested list, test_list. I would like to search for the value 5 only in element x of test_list, and return the indices of the list elements where it occurs, e.g. here c(1, 4).
Thanks a lot.
I would try it with lapply, like this:
a = list(x = 5, y = c(11, 12, 13))
b = list(x = 4.7, y = c(112, 5, 2))
c = list(x = 77, y = c(5, 1, 1))
d = list(x = 5, y = c(22, 11, 43))
test_list = list(a, b, c, d)
which(unlist(lapply(test_list, function(x) {
  x$x == 5
})))
First lapply extracts x from each element and compares it to 5, then unlist flattens the result, and which returns the indices that are TRUE.
Try:
which(vapply(test_list, function(x) x[["x"]] == 5, logical(1)))
Similarly, using purrr:
which(map_lgl(test_list, ~ pluck(., "x") == 5))
[1] 1 4
As 'x' has length 1 in each list element, it may be better to extract the element first and do the comparison all at once:
which(sapply(test_list, `[[`, 'x')==5)
#[1] 1 4
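If you also want the matching sub-lists rather than just their positions, the same comparison can be reused to subset the list; a small follow-up sketch:

idx <- which(sapply(test_list, `[[`, 'x') == 5)
test_list[idx]   # the elements of test_list whose x equals 5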

Combine two lists with equal element names into a list of lists by element name in R

I have two lists, one with matrices and one with vectors. Both have the same element names. It looks like this:
set.seed(1)
mat_list <- list("2009" = matrix(runif(n = 9, min = 0, max = 10), 3, 3),
                 "2010" = matrix(runif(n = 9, min = 0, max = 10), 3, 3))
vec_list <- list("2009" = c(runif(n = 3, min = 0, max = 10)),
                 "2010" = c(runif(n = 3, min = 0, max = 10)))
What I want is to create a new list with elements 2009 and 2010, each containing the respective matrix and vector, so that I can access them both in a lapply call. My actual data has some more years, so it would be nice not to have to reference the years explicitly.
I found a bunch of similar questions, but I couldn't figure out how to apply the answers to my situation. Thanks!
With the purrr package and the map2 function, you can do this:
# install.packages("purrr")
set.seed(1)
mat_list <- list("2009" = matrix(runif(n = 9, min = 0, max = 10), 3, 3),
                 "2010" = matrix(runif(n = 9, min = 0, max = 10), 3, 3))
vec_list <- list("2009" = c(runif(n = 3, min = 0, max = 10)),
                 "2010" = c(runif(n = 3, min = 0, max = 10)))
l <- purrr::map2(mat_list, vec_list, function(x, y) list(x, y))
# l <- purrr::map2(mat_list, vec_list, ~list(.x, .y))  # shorter notation
# l <- purrr::map2(mat_list, vec_list, list)           # even shorter
# x and y inside map2 are the elements of each list at each iteration,
# so we can combine them in a list
Thanks to @markus:
l <- Map(list, mat_list, vec_list) # no need for another package
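Either way, each element of l now holds the matrix and the vector of one year, so both can be accessed inside a single lapply call. A usage sketch (the matrix-vector product is just a placeholder for whatever you actually need to compute):

res <- lapply(l, function(el) el[[1]] %*% el[[2]])   # multiply each year's matrix by its vector
res[["2009"]]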

R - removing data table rows based on two values

I have a large data frame (tbl_df) with approximately the following information:
data <- data.frame(Energy = sample(1:200, 100, replace = TRUE),
                   strip1 = sample(1:12, 100, replace = TRUE),
                   strip2 = sample(1:12, 100, replace = TRUE))
It has 3 columns. The first is energy, the second and third are strip numbers (where energy was deposited).
Each strip has a different threshold and these are stored in two numeric arrays, each position in the array is for the corresponding strip number:
threshold_strip1 <- c(4, 6, 3, 7, 7, 1, 2, 5, 8, 10, 2, 2)
threshold_strip2 <- c(5, 3, 5, 7, 6, 2, 7, 7, 10, 2, 2, 2)
These tell me the minimum amount of energy each strip can register. What I want to be able to do is remove the rows from the data frame where the energy does not exceed BOTH strips' thresholds.
As an example, if I have the row:
Energy = 4, strip1 = 2, strip2 = 2
then I would remove this row: although strip2's threshold (3) is lower than 4, strip1 has a threshold of 6, so there isn't enough energy here.
Apologies if this question is worded poorly, I couldn't seem to find anything like it in old questions.
# keep a row only if its Energy meets the threshold of the strip in each column
filter1 <- data$Energy >= threshold_strip1[data$strip1]
filter2 <- data$Energy >= threshold_strip2[data$strip2]
data <- subset(data, filter1 & filter2)
I'd maybe do...
library(data.table)
setDT(data)

# structure lower-bound rules
threshes = list(threshold_strip1, threshold_strip2)
lbDT = data.table(
  strip_loc = rep(seq_along(threshes), lengths(threshes)),
  strip_num = unlist(lapply(threshes, seq_along)),
  thresh = unlist(threshes)
)

# loop over strip locations (strip1, strip2, etc)
# marking where threshold is not met
data[, keep := TRUE]
lbDT[, {
  onexpr = c(sprintf("strip%s==s", strip_loc), "Energy<th")
  data[.(s = strip_num, th = thresh), on = onexpr, keep := FALSE]
  NULL
}, by = strip_loc]
What about this? Using dplyr:
require(dplyr)
data2 <- data %>%
  mutate(
    strip1_value = threshold_strip1[strip1],
    strip2_value = threshold_strip2[strip2],
    to_keep = Energy > strip1_value & Energy > strip2_value
  ) %>%
  filter(to_keep == TRUE)
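If the helper columns are not needed afterwards, the same condition can also go directly into filter(); a more compact sketch of the dplyr answer above:

data2 <- data %>%
  filter(Energy > threshold_strip1[strip1] & Energy > threshold_strip2[strip2])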

Divide vector with grouping vector

I have two vectors which I would like to combine in one data frame. One of the vectors, value, needs to be divided into two columns. The second vector, nc, gives the number of values for each observation. If nc is 1, only one value is given in value (which goes into val1) and 999 is to be written in the second column (val2).
What is an R-ish way to divide the vector value and populate the two columns of df? I suspect I'm missing something very obvious, but I can't proceed at the moment... Many thanks!
set.seed(123)
nc <- sample(1:2, 10, replace = TRUE)
value <- sample(1:6, sum(nc), replace = TRUE)
# result by hand
df <- data.frame(nc = nc,
                 val1 = c(6, 3, 4, 1, 2, 2, 6, 5, 6, 5),
                 val2 = c(999, 5, 999, 6, 1, 999, 6, 4, 4, 999))
Here's an approach based on this answer:
set.seed(123)
nc <- sample(1:2, 10, replace = TRUE)
value <- sample(1:6, sum(nc), replace = TRUE)
splitUsing <- function(x, pos) {
  unname(split(x, cumsum(seq_along(x) %in% cumsum(replace(pos, 1, pos[1] + 1)))))
}

combineValues <- function(vals, nums) {
  mydf <- data.frame(cbind(nums, do.call(rbind, splitUsing(vals, nums))))
  mydf$V3[mydf$nums == 1] <- 999
  return(mydf)
}

df <- combineValues(value, nc)
I think this is what you are looking for. I'm not sure it is the fastest way, but it should do the trick.
# pre-allocate the result so the loop does not rely on the hand-built df above
df <- data.frame(nc = nc, val1 = NA_real_, val2 = NA_real_)
count <- 0
for (i in 1:length(nc)) {
  count <- count + nc[i]
  if (nc[i] == 1) {
    df$val1[i] <- value[count]
    df$val2[i] <- 999
  } else {
    df$val1[i] <- value[count - 1]
    df$val2[i] <- value[count]
  }
}
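A vectorised alternative (my own sketch, not from the original answers): the last position of each observation in value is cumsum(nc) and its first position is cumsum(nc) - nc + 1, so both columns can be filled without a loop:

end_pos   <- cumsum(nc)
start_pos <- end_pos - nc + 1
df <- data.frame(nc   = nc,
                 val1 = value[start_pos],
                 val2 = ifelse(nc == 2, value[end_pos], 999))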
