dplyr pipeline in a function - r

I'm trying to put a dplyr pipeline in a function, but after reading the programming vignette multiple times, as well as the tidy evaluation guide (https://tidyeval.tidyverse.org/dplyr.html), I still can't get it to work...
#Sample data:
dat <- read.table(text = "A ID B
1 X 83
2 X NA
3 X NA
4 Y NA
5 X 2
6 Y 2
12 Y 10
7 Y 18
8 Y 85", header = TRUE)
# What I'm trying to do (count('ID') with a freq column relies on plyr::count):
x <- dat %>% filter(!is.na(B)) %>% count('ID') %>% filter(freq > 3)
x$ID
# Now in a function:
n_occurences <- function(df, n, column){
  # Group by ID and return IDs with number of non-NA values > n in column
  column <- enquo(column)
  x <- df %>%
    filter(!is.na(!!column)) %>%
    count('ID') %>%
    filter(freq > n)
  x$ID
}
# Let's try:
col <- 'B'
n_occurences(dat, n=3, column = col)
There is no error, but the output is wrong. This has something to do with the tidy evaluation, but I just can't get my head around it.

The reason there is no error but the output is wrong: enquo() captures the expression you passed in (col), which evaluates to the string "B", so filter(!is.na(!!column)) tests the constant "B", which is TRUE for every row, and nothing gets filtered out. With rlang 0.4.0, we can do this much more easily by using the {{...}} or curly-curly operator:
library(rlang)
library(dplyr)
n_occurences <- function(df, n1, column){
  df %>%
    filter(!is.na({{column}})) %>%
    count(ID) %>%
    filter(n > n1) %>%
    pull(ID)
}
n_occurences(dat, n1 = 3, column = B)
#[1] Y
#Levels: X Y
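Note, however, that passing a quoted string to this version still does not do what we want: {{column}} captures the variable col, which evaluates to the string "B", so !is.na("B") is TRUE for every row and the NAs are never dropped. A quick check, reusing dat from the question:
col <- 'B'
n_occurences(dat, n1 = 3, column = col)
#[1] X Y      <- wrong: nothing was filtered, so both IDs pass n > 3
#Levels: X Y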
If we intend to pass a quoted string, convert it to a symbol (sym) and then do the evaluation (!!):
n_occurences <- function(df, n1, column){
  column <- rlang::sym(column)
  df %>%
    filter(!is.na(!!column)) %>%
    count(ID) %>%
    filter(n > n1) %>%
    pull(ID)
}
col <- 'B'
n_occurences(dat, n1=3, column = col)
#[1] Y
#Levels: X Y

Related

calculating the duration and the order of non-continuous events in R

My dataset consists of a series of behaviours observed in videos. For each behaviour, I have recorded when it starts and when it ends.
datain <- data.frame(
  A = c("1/5+11/18", "0/5", "7/10"),
  B = c("6/10+19/25", "11/15", "11/20"),
  C = c("26/30", "6/10", "0/6"))
I would like to get the duration of each behaviour, as well as the order of the behaviours, for each observation, as in this desired output:
dataout <- data.frame(
  A = c("1/5+11/18", "0/5", "7/10"),
  B = c("6/10+19/25", "11/15", "11/20"),
  C = c("26/30", "6/10", "0/6"),
  A.sum = c(11, 5, 3),
  B.sum = c(10, 4, 9),
  C.sum = c(4, 4, 6),
  myorder = c("A/B/A/B/C", "A/C/B", "C/A/B"))
I am experimenting with the following lines to identify which columns contain a + and to extract the rows with the interrupted behaviours (I still have to calculate the duration of each behaviour), but I suspect there is a more efficient solution than the one I am currently attempting.
library(stringr)
d.1 <- lapply(datain, function(x) str_which(x, "\\+"))
d.2 <- which(lapply(d.1, length) > 0)
coltosum <- match(names(d.2), colnames(datain))
mylist <- lapply(datain[coltosum], function(x) strsplit(x, "\\+"))
As always, I would greatly appreciate any suggestion.
Please note that I edited this question after a few days to include the order of the behaviours in the desired output.
Update: I have been able to figure out how to get the sequence of the behaviours. I bet there are more elegant and concise ways to get this result; the code is below.
# removing empty columns
empty_columns <- sapply(datain, function(x) all(is.na(x) | x == ""))
datain <- datain[, !empty_columns]
library(plyr)
# loop 1 #
# this loop is for taking the occurrences of each behaviour (BH)
# (note: the body does not depend on i, so a single pass would suffice)
mylist <- list()
for (i in seq(1, nrow(datain))){
  mylist <- apply(datain, 1, str_extract_all, pattern = "\\d+")
  myindx <- sapply(mylist, length)
  myres <- c(do.call(cbind, lapply(mylist, `length<-`, max(myindx))))
  names(myres) <- rep(colnames(datain), nrow(datain))
  mydf <- ldply(myres, data.frame)
  colnames(mydf) <- c("BH", "values")
}
# loop 2 #
# this loop is for counting the number of elements in a nested list
mydf.1 <- list()
myres.2 <- list()
for (i in seq(1, nrow(datain))){
  mydf.1 <- length(unlist(mylist[i]))
  myres.2[i] <- mydf.1
}
# this is for placing the row values
names(myres.2) <- rownames(datain)
myres.3 <- as.numeric(myres.2)
mydf$myrow <- c(rep(rownames(datain), myres.3))
# order by row and by values
mydf <- mydf[order(as.numeric(mydf$myrow), as.numeric(mydf$values)), ]
# pick up the right values:
# generate as many sequences as there are elements for each row
myseq <- sequence(myres.3)
mydf <- cbind(mydf, myseq)
myseq.2 <- seq(1, nrow(mydf), by = 2)
# selecting the df according to the uneven rows
mydf.1 <- mydf[myseq.2, ]
myorder <- split(mydf.1, mydf.1$myrow)
# loop 3
myres.3 <- list()
for (i in seq(1, nrow(datain))){
  myres.3 <- lapply(myorder, "[", i = 1)
}
myorder.def <- data.frame(cbind(lapply(myres.3, paste0, collapse = "/")))
colnames(myorder.def) <- "BH"
# last step: strip the list-print artefacts with str_replace_all
myorder.def$BH <- str_replace_all(myorder.def$BH, "c", "")
myorder.def$BH <- str_replace_all(myorder.def$BH, "\\(", "")
myorder.def$BH <- str_replace_all(myorder.def$BH, "\\)", "")
myorder.def$BH <- str_replace_all(myorder.def$BH, "\"", "")
myorder.def$BH <- str_replace_all(myorder.def$BH, ", ", "/")
data.out <- cbind(datain, myorder.def)
data.out
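A much more concise sketch for the myorder column (my assumption: the order of the behaviours is determined by the start time of each interval, i.e. the number before the first /):
# base R sketch: split each cell on "+", take the start of every interval,
# then order the behaviour names by those start times within each row
myorder <- unname(apply(datain, 1, function(r) {
  starts <- lapply(strsplit(r, "+", fixed = TRUE),
                   function(s) as.numeric(sub("/.*", "", s)))
  paste(rep(names(r), lengths(starts))[order(unlist(starts))],
        collapse = "/")
}))
myorder
#[1] "A/B/A/B/C" "A/C/B"     "C/A/B"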
An option in base R would be to loop over the columns of the dataset (lapply), rewrite each start/end pair as its difference by capturing the digits on both sides of the / and switching the backreferences ("(\\d+)/(\\d+)" becomes "(\\2-\\1)"), and then eval(parse the resulting string:
datain[paste0(names(datain), ".sum")] <- lapply(datain, function(y)
  sapply(gsub("(\\d+)/(\\d+)", "(\\2-\\1)", y),
         function(x) eval(parse(text = x))))
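To see what the substitution does on a single cell (the first value of A in the question's datain):
s <- gsub("(\\d+)/(\\d+)", "(\\2-\\1)", "1/5+11/18")
s                     # "(5-1)+(18-11)"
eval(parse(text = s)) # 11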
Checking with the OP's output (note: this check uses a different datain than the one at the top of the question):
> datain
              A         B    C A.sum B.sum C.sum
1 3/4+6/8+11/16 0/5+15/20  0/5     8    10     5
2           0/5      5/10 3/10     5     5     7
> dataout
              A         B    C A.sum B.sum C.sum
1 3/4+6/8+11/16 0/5+15/20  0/5     8    10     5
2           0/5      5/10 3/10     5     5     7
Or with tidyverse: group by rows, loop across all the columns, read each string into a data.frame with read.table (replacing + with a newline and splitting on /), subtract the columns, take the sum, and return the results as new columns by modifying .names:
library(dplyr)
library(stringr)
datain %>%
  rowwise %>%
  mutate(across(everything(), ~ sum(with(read.table(text =
      str_replace_all(.x, fixed("+"), "\n"), sep = "/",
      header = FALSE), V2 - V1)), .names = "{.col}.sum")) %>%
  ungroup
Output:
# A tibble: 2 × 6
  A             B         C     A.sum B.sum C.sum
  <chr>         <chr>     <chr> <int> <int> <int>
1 3/4+6/8+11/16 0/5+15/20 0/5       8    10     5
2 0/5           5/10      3/10      5     5     7
Another base R approach might be the following. First split by +, then split again by /, taking the sum of differences in the resulting values.
datain[paste0(names(datain), ".sum")] <-
  lapply(datain, function(x) {
    sapply(strsplit(x, "[+]"), function(y) {
      sum(sapply(strsplit(y, "[/]"), function(z) diff(as.numeric(z))))
    })
  })
datain
Output
              A         B    C A.sum B.sum C.sum
1 3/4+6/8+11/16 0/5+15/20  0/5     8    10     5
2           0/5      5/10 3/10     5     5     7
Update:
Slightly improved:
library(dplyr)
library(tidyr)
library(data.table)
datain %>%
  pivot_longer(everything()) %>%
  separate_rows(value, sep = "\\+|\\/", convert = TRUE) %>%
  group_by(group = rleid(name)) %>%
  mutate(value = value - lag(value, default = value[1])) %>%
  slice(which(row_number() %% 2 == 0)) %>%
  mutate(value = sum(value),
         name = paste0(name, ".sum")) %>%
  slice(1) %>%
  ungroup() %>%
  select(-group) %>%
  group_by(name) %>%
  mutate(id = row_number()) %>%
  pivot_wider(names_from = name, values_from = value) %>%
  select(-id) %>%
  cbind(datain)
This row:
  separate_rows(value, sep = "\\+|\\/", convert = TRUE) %>%
is the same as:
  separate_rows(value, sep = "\\+") %>%
  separate_rows(value, sep = "\\/") %>%
  type.convert(as.is = TRUE) %>%
And the very, very long way from start to finish :-)
library(dplyr)
library(tidyr)
library(data.table)
datain %>%
  pivot_longer(everything()) %>%
  separate_rows(value, sep = "\\+") %>%
  separate_rows(value, sep = "\\/") %>%
  group_by(group = as.integer(gl(n(), 2, n()))) %>%
  type.convert(as.is = TRUE) %>%
  mutate(x = value - lag(value, default = value[1])) %>%
  ungroup() %>%
  group_by(group = rleid(name)) %>%
  mutate(x = sum(x)) %>%
  mutate(labels = paste0(name, ".sum")) %>%
  slice(1) %>%
  ungroup() %>%
  select(-c(name, group, value)) %>%
  pivot_wider(names_from = labels,
              values_from = x,
              values_fn = list) %>%
  unnest(cols = c(A.sum, B.sum, C.sum)) %>%
  cbind(datain)
  A.sum B.sum C.sum             A         B    C
1     8    10     5 3/4+6/8+11/16 0/5+15/20  0/5
2     5     5     7           0/5      5/10 3/10

How to use grep or any other method to compare different numbers of rows in two data frames and get the matches and mismatches?

Below are my two data frames:
ABData1 <- data.frame(id = c(11, 12, 13, 14, 15),
                      a = c(1, 2, 3, 4, 5))
ABData2 <- data.frame(id = c(11, 12, 13, 14),
                      b = c(1, 4, 3, 4))
How do I compare these two data frames for matching and mismatching rows? If the value in the 1st row of a in ABData1 matches the 1st row of b in ABData2, show it as a match, otherwise as a mismatch, then move on to the 2nd row, and so on; all comparisons are row-wise. I have tried the code below, which works fine for a single data frame, but it throws an error here because the two data frames have different numbers of rows.
ABData <- data.frame(a = c(1, 2, 2, 1, 1),
                     b = c(1, 2, 1, 1, 2))
match <- ABData %>% rowwise() %>% filter(grepl(a, b, fixed = TRUE))
mismatch <- ABData %>% rowwise() %>% filter(!grepl(a, b))
I am expecting the output below.
Expected match output:
id a expected b
11 1 1 1
13 3 3 3
14 4 4 4
Expected mismatch output:
id a expected b
12 2 2 4
15 NA NA 5
Thanks in advance.
You can use this:
ABData1 <- data.frame(a = c(1, 2, 3, 4, 5))
ABData2 <- data.frame(b = c(1, 4, 3, 4))
equLength <- function(x, y) {
  if (length(x) > length(y)) length(y) <- length(x) else length(x) <- length(y)
  data.frame(a = x, b = y)
}
ABData <- equLength(ABData1$a, ABData2$b)
... and then use your working code for one data frame:
library("dplyr")
resultMatch <- ABData %>% rowwise() %>% filter(grepl(a, b, fixed = TRUE))
resultMismatch <- ABData %>% rowwise() %>% filter(!grepl(a, b))
For the extended question:
library("dplyr")
ABData1 <- data.frame(id = c(11, 12, 13, 14, 15), a = c(1, 2, 3, 4, 5))
ABData2 <- data.frame(id = c(11, 12, 13, 14), b = c(1, 4, 3, 4))
equLength <- function(x, y) {
  if (length(x) > length(y)) length(y) <- length(x) else length(x) <- length(y)
  data.frame(a = x, b = y)
}
if (nrow(ABData1) > nrow(ABData2)) {
  ABData <- data.frame(ABData1, b = equLength(ABData1$a, ABData2$b)$b)
} else {
  ABData <- data.frame(ABData2, a = equLength(ABData1$a, ABData2$b)$a)
}
resultMatch <- ABData %>% rowwise() %>% filter(grepl(a, b, fixed = TRUE))
resultMismatch <- ABData %>% rowwise() %>% filter(!grepl(a, b))

How to use apply functions correctly when there are NA values

I'd like to calculate a function on multiple columns of a data frame with random NA values. I have two questions:
1. How to deal with NAs? The code runs when I try it on non-NA columns, but returns NA when there are NAs, even though I remove them.
2. How to print the results in data frame format instead of multiple arrays? I used mapply, but it doesn't seem to do the calculations correctly.
Here is my code:
# create a data frame with random NAs
library(dplyr)
df <- data.frame(category1 = sample(c(1:10), 100, replace = TRUE),
                 category2 = sample(c(1:10), 100, replace = TRUE))
insert_nas <- function(x) {
  len <- length(x)
  n <- sample(1:floor(0.2*len), 1)
  i <- sample(1:len, n)
  x[i] <- NA
  x
}
df <- sapply(df, insert_nas) %>% as.data.frame()
df$type <- sample(c("A", "B", "C"), 100, replace = TRUE)
# using apply:
library(NPS)
apply(df[, c('category1', 'category2')], 2,
      function(x) df %>% filter(!is.na(x)) %>% group_by(type) %>%
        transmute(nps(x)) %>% unique())
#results:
$category1
# A tibble: 3 x 2
# Groups:   type [3]
  type  `nps(x)`
  <chr>    <dbl>
1 B           NA
2 A           NA
3 C           NA
...
# using mapply
mapply(function(x) df %>% filter(!is.na(x)) %>% group_by(type) %>%
         transmute(nps(x)) %>% unique(), df[, c('category1', 'category2')])
# results:
       category1   category2
type   Character,3 Character,3
nps(x) Numeric,3   Numeric,3
Regarding the function I use, it doesn't have a built in way to deal with NAs, so I remove NAs prior to calling it.
I still used the !is.na part of your code because it seems that nps can't deal with NAs, even though the documentation says it should (possibly a bug). I changed your apply to lapply and passed the variable names as the list. Then I used get to treat the quoted variable name as a column of your df.
df <- data.frame(category1 = sample(c(1:10), 100, replace = TRUE),
                 category2 = sample(c(1:10), 100, replace = TRUE))
insert_nas <- function(x) {
  len <- length(x)
  n <- sample(1:floor(0.2*len), 1)
  i <- sample(1:len, n)
  x[i] <- NA
  x
}
df <- sapply(df, insert_nas) %>% as.data.frame()
df$type <- sample(c("A", "B", "C"), 100, replace = TRUE)
# using lapply:
library(NPS)
df2 <- as.data.frame(lapply(c('category1', 'category2'),
  function(x) df %>% filter(!is.na(get(x))) %>% group_by(type) %>%
    transmute(nps(get(x))) %>% unique()
), stringsAsFactors = FALSE)
colnames(df2) <- c("type", "nps_cat1", "type2", "nps_cat2")
# type2 is redundant
df2 <- select(df2, -type2)
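With current dplyr (>= 1.0), a more compact sketch of the same computation uses across(); here na.omit() stands in for the !is.na() filtering, under the same assumption that nps() just needs the NAs removed first:
library(dplyr)
library(NPS)
df %>%
  group_by(type) %>%
  summarise(across(c(category1, category2),
                   ~ nps(na.omit(.x)),
                   .names = "nps_{.col}"))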

Apply function over data frame rows

I'm trying to apply a function over the rows of a data frame and return a value based on the value of each element in a column. I'd prefer to pass the whole data frame instead of naming each variable, as the actual code has many variables; this is a simple example.
I've tried purrr map_dbl and rowwise but can't get either to work. Any suggestions please?
# sample df
df <- data.frame(Y = c("A", "B", "B", "A", "B"),
                 X = c(1, 5, 8, 23, 31))
# required result
Res <- data.frame(Y = c("A", "B", "B", "A", "B"),
                  X = c(1, 5, 8, 23, 31),
                  NewVal = c(10, 500, 800, 230, 3100))
# use mutate and map or rowwise etc.
Res <- df %>%
  mutate(NewVal = map_dbl(.x = ., .f = FnAdd(.)))
Res <- df %>%
  rowwise() %>%
  mutate(NewVal = FnAdd(.))
# sample fn
FnAdd <- function(Data){
  if (Data$Y == "A"){
    X = Data$X * 10
  }
  if (Data$Y == "B"){
    X = Data$X * 100
  }
  return(X)
}
If there are multiple values, it is better to have a key/value dataset, join, and then do the multiplication:
keyVal <- data.frame(Y = c("A", "B"), NewVal = c(10, 100))
df %>%
  left_join(keyVal) %>%
  mutate(NewVal = X * NewVal)
# Y X NewVal
#1 A 1 10
#2 B 5 500
#3 B 8 800
#4 A 23 230
#5 B 31 3100
It is not clear how many unique values there are in the actual dataset's 'Y' column. If there are only a few values, then case_when can be used:
FnAdd <- function(Data){
  Data %>%
    mutate(NewVal = case_when(Y == "A" ~ X * 10,
                              Y == "B" ~ X * 100,
                              TRUE ~ X))
}
FnAdd(df)
FnAdd(df)
# Y X NewVal
#1 A 1 10
#2 B 5 500
#3 B 8 800
#4 A 23 230
#5 B 31 3100
You were originally looking for a solution using dplyr's rowwise() function, so here is that solution. The nice thing about this approach is that you don't need to create a separate function.
Here's the version using ifelse() (the inner ifelse() needs a third argument; the original nested call omitted it, which would error whenever Y is neither "A" nor "B"):
df %>%
  rowwise() %>%
  mutate(NewVal = ifelse(Y == "A", X * 10,
                         ifelse(Y == "B", X * 100, NA_real_)))
and here's the version using case_when():
df %>%
  rowwise() %>%
  mutate(NewVal = case_when(Y == "A" ~ X * 10,
                            Y == "B" ~ X * 100))
(Both ifelse() and case_when() are vectorised, so the rowwise() call is not strictly necessary here.)

How to use a function argument to set a column name

I have a dataframe df as follows:
df <- data.frame(x = c("A", "A", "B", "B"), y = 1:4)
And I have a function that finds the mean of y grouped by x:
generateVarMean <- function(df, x, y) {
  mean.df <- df %>%
    select(x, y) %>%
    group_by(x) %>%
    dplyr::summarise(variable.mean = mean(y, na.rm = TRUE))
  colnames(mean.df)[2] <- paste0("y", ".mean")
  print(mean.df)
}
However, I want the first argument of paste0() to reflect the actual function argument (i.e. so that it can be used for different dataframes).
Desired functionality:
df1 <- data.frame(a = c("A", "A", "B", "B"), b = 1:4)
generateVarMean(df1, a, b)
  a b.mean
1 A    1.5
2 B    3.5
Any help getting pointed in the right direction is very much appreciated.
We can make use of the quosure from the devel version of dplyr (soon to be released as 0.6.0):
generateVarMean <- function(df, x, y) {
  x <- enquo(x)
  y <- enquo(y)
  newName <- paste0(quo_name(y), ".mean")
  df %>%
    select(UQ(x), UQ(y)) %>%
    group_by(UQ(x)) %>%
    summarise(UQ(newName) := mean(UQ(y), na.rm = TRUE))
}
generateVarMean(df1, a, b)
# A tibble: 2 × 2
#       a b.mean
#  <fctr>  <dbl>
#1      A    1.5
#2      B    3.5
We get the input arguments as quosures with enquo, and convert the quosure to a string with quo_name to create the 'newName' string. Then we evaluate the quosures inside select/group_by/summarise by unquoting (UQ or !!). Note that in the new version we can also assign the column name directly, using the assignment operator (:=).
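For reference, on current releases (dplyr >= 1.0, rlang >= 0.4) the same function can be sketched with the curly-curly operator and glue-style name injection; this assumes a recent version, since the older UQ() spelling is now deprecated:
generateVarMean <- function(df, x, y) {
  df %>%
    group_by({{ x }}) %>%
    summarise("{{ y }}.mean" := mean({{ y }}, na.rm = TRUE))
}
generateVarMean(df1, a, b)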
No need to add anything to the function. Just replace paste0("y", ".mean") with paste0(deparse(substitute(y)), ".mean"). The function and the output will then be:
> generateVarMean <- function(df, x, y) {
    mean.df <- df %>%
      select(x, y) %>%
      group_by(x) %>%
      dplyr::summarise(variable.mean = mean(y, na.rm = TRUE))
    colnames(mean.df)[2] <- paste0(deparse(substitute(y)), ".mean")
    print(mean.df)
  }
> generateVarMean(df, a, b)
# A tibble: 2 × 2
       x b.mean
  <fctr>  <dbl>
1      A    1.5
2      B    3.5
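A quick sketch of what deparse(substitute()) does on its own (base R only):
f <- function(y) paste0(deparse(substitute(y)), ".mean")
f(b)
#[1] "b.mean"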
