dplyr pipeline in a function - r

I'm trying to put a dplyr pipeline in a function, but after reading the programming vignette multiple times, as well as the tidy evaluation guide (https://tidyeval.tidyverse.org/dplyr.html), I still can't get it to work...
#Sample data:
dat <- read.table(text = "A ID B
1 X 83
2 X NA
3 X NA
4 Y NA
5 X 2
6 Y 2
12 Y 10
7 Y 18
8 Y 85", header = TRUE)
# What I'm trying to do (count('ID') with a freq column relies on plyr::count):
x <- dat %>% filter(!is.na(B)) %>% count('ID') %>% filter(freq > 3)
x$ID
# Now in a function:
n_occurences <- function(df, n, column){
  # Group by ID and return IDs with number of non-NA values > n in column
  column <- enquo(column)
  x <- df %>%
    filter(!is.na(!!column)) %>%
    count('ID') %>%
    filter(freq > n)
  x$ID
}
# Let's try:
col <- 'B'
n_occurences(dat, n=3, column = col)
There is no error, but the output is wrong. This has something to do with the tidy evaluation, but I just can't get my head around it.

The reason there is no error but the output is wrong: enquo() captures the expression you passed in (col), which evaluates to the string "B", so filter(!is.na(!!column)) tests the constant "B", which is TRUE for every row, and nothing gets filtered out. With rlang 0.4.0, we can do this much more easily by using the {{...}} or curly-curly operator:
library(rlang)
library(dplyr)
n_occurences <- function(df, n1, column){
  df %>%
    filter(!is.na({{column}})) %>%
    count(ID) %>%
    filter(n > n1) %>%
    pull(ID)
}
n_occurences(dat, n1 = 3, column = B)
#[1] Y
#Levels: X Y
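Note, however, that passing a quoted string to this version still does not do what we want: {{column}} captures the variable col, which evaluates to the string "B", so !is.na("B") is TRUE for every row and the NAs are never dropped. A quick check, reusing dat from the question:
col <- 'B'
n_occurences(dat, n1 = 3, column = col)
#[1] X Y      <- wrong: nothing was filtered, so both IDs pass n > 3
#Levels: X Y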
If we intend to pass a quoted string, convert it to a symbol (sym) and then do the evaluation (!!):
n_occurences <- function(df, n1, column){
  column <- rlang::sym(column)
  df %>%
    filter(!is.na(!!column)) %>%
    count(ID) %>%
    filter(n > n1) %>%
    pull(ID)
}
col <- 'B'
n_occurences(dat, n1=3, column = col)
#[1] Y
#Levels: X Y

Related

calculating the duration and the order of non-continuous events in R

My dataset consists of a series of behaviours observed in videos. For each behaviour, I have recorded when it starts and when it ends.
datain <- data.frame(
  A = c("1/5+11/18", "0/5", "7/10"),
  B = c("6/10+19/25", "11/15", "11/20"),
  C = c("26/30", "6/10", "0/6"))
I would like to get the duration of each behaviour, as well as the order of the behaviours, for each observation, as in this desired output:
dataout <- data.frame(
  A = c("1/5+11/18", "0/5", "7/10"),
  B = c("6/10+19/25", "11/15", "11/20"),
  C = c("26/30", "6/10", "0/6"),
  A.sum = c(11, 5, 3),
  B.sum = c(10, 4, 9),
  C.sum = c(4, 4, 6),
  myorder = c("A/B/A/B/C", "A/C/B", "C/A/B"))
I am experimenting with the following lines to identify which columns contain a + and to extract the rows with the interrupted behaviours (I still have to calculate the duration of each behaviour), but I suspect there is a more efficient solution than the one I am currently attempting.
library(stringr)
d.1 <- lapply(datain, function(x) str_which(x, "\\+"))
d.2 <- which(lapply(d.1, length) > 0)
coltosum <- match(names(d.2), colnames(datain))
mylist <- lapply(datain[coltosum], function(x) strsplit(x, "\\+"))
As always, I would greatly appreciate any suggestion.
Please note that I edited this question after a few days to include the order of the behaviours in the desired output.
Update: I have been able to figure out how to get the sequence of the behaviours. I bet there are more elegant and concise ways to get this result; the code is below.
# removing empty columns
empty_columns <- sapply(datain, function(x) all(is.na(x) | x == ""))
datain <- datain[, !empty_columns]
library(plyr)
# loop 1 #
# this loop is for taking the occurrences of each behaviour (BH)
# (note: the body does not depend on i, so a single pass would suffice)
mylist <- list()
for (i in seq(1, nrow(datain))){
  mylist <- apply(datain, 1, str_extract_all, pattern = "\\d+")
  myindx <- sapply(mylist, length)
  myres <- c(do.call(cbind, lapply(mylist, `length<-`, max(myindx))))
  names(myres) <- rep(colnames(datain), nrow(datain))
  mydf <- ldply(myres, data.frame)
  colnames(mydf) <- c("BH", "values")
}
# loop 2 #
# this loop is for counting the number of elements in a nested list
mydf.1 <- list()
myres.2 <- list()
for (i in seq(1, nrow(datain))){
  mydf.1 <- length(unlist(mylist[i]))
  myres.2[i] <- mydf.1
}
# this is for placing the row values
names(myres.2) <- rownames(datain)
myres.3 <- as.numeric(myres.2)
mydf$myrow <- c(rep(rownames(datain), myres.3))
# order by row and by values
mydf <- mydf[order(as.numeric(mydf$myrow), as.numeric(mydf$values)), ]
# pick up the right values:
# generate as many sequences as there are elements for each row
myseq <- sequence(myres.3)
mydf <- cbind(mydf, myseq)
myseq.2 <- seq(1, nrow(mydf), by = 2)
# selecting the df according to the uneven rows
mydf.1 <- mydf[myseq.2, ]
myorder <- split(mydf.1, mydf.1$myrow)
# loop 3
myres.3 <- list()
for (i in seq(1, nrow(datain))){
  myres.3 <- lapply(myorder, "[", i = 1)
}
myorder.def <- data.frame(cbind(lapply(myres.3, paste0, collapse = "/")))
colnames(myorder.def) <- "BH"
# last step: strip the list-print artefacts with str_replace_all
myorder.def$BH <- str_replace_all(myorder.def$BH, "c", "")
myorder.def$BH <- str_replace_all(myorder.def$BH, "\\(", "")
myorder.def$BH <- str_replace_all(myorder.def$BH, "\\)", "")
myorder.def$BH <- str_replace_all(myorder.def$BH, "\"", "")
myorder.def$BH <- str_replace_all(myorder.def$BH, ", ", "/")
data.out <- cbind(datain, myorder.def)
data.out
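A much more concise sketch for the myorder column (my assumption: the order of the behaviours is determined by the start time of each interval, i.e. the number before the first /):
# base R sketch: split each cell on "+", take the start of every interval,
# then order the behaviour names by those start times within each row
myorder <- unname(apply(datain, 1, function(r) {
  starts <- lapply(strsplit(r, "+", fixed = TRUE),
                   function(s) as.numeric(sub("/.*", "", s)))
  paste(rep(names(r), lengths(starts))[order(unlist(starts))],
        collapse = "/")
}))
myorder
#[1] "A/B/A/B/C" "A/C/B"     "C/A/B"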
An option in base R would be to loop over the columns of the dataset (lapply), rewrite each start/end pair as its difference by capturing the digits on both sides of the / and switching the backreferences ("(\\d+)/(\\d+)" becomes "(\\2-\\1)"), and then eval(parse the resulting string:
datain[paste0(names(datain), ".sum")] <- lapply(datain, function(y)
  sapply(gsub("(\\d+)/(\\d+)", "(\\2-\\1)", y),
         function(x) eval(parse(text = x))))
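To see what the substitution does on a single cell (the first value of A in the question's datain):
s <- gsub("(\\d+)/(\\d+)", "(\\2-\\1)", "1/5+11/18")
s                     # "(5-1)+(18-11)"
eval(parse(text = s)) # 11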
Checking with the OP's output (note: this check uses a different datain than the one at the top of the question):
> datain
              A         B    C A.sum B.sum C.sum
1 3/4+6/8+11/16 0/5+15/20  0/5     8    10     5
2           0/5      5/10 3/10     5     5     7
> dataout
              A         B    C A.sum B.sum C.sum
1 3/4+6/8+11/16 0/5+15/20  0/5     8    10     5
2           0/5      5/10 3/10     5     5     7
Or with tidyverse: group by rows, loop across all the columns, read each string into a data.frame with read.table (replacing + with a newline and splitting on /), subtract the columns, take the sum, and return the results as new columns by modifying .names:
library(dplyr)
library(stringr)
datain %>%
  rowwise %>%
  mutate(across(everything(), ~ sum(with(read.table(text =
      str_replace_all(.x, fixed("+"), "\n"), sep = "/",
      header = FALSE), V2 - V1)), .names = "{.col}.sum")) %>%
  ungroup
Output:
# A tibble: 2 × 6
  A             B         C     A.sum B.sum C.sum
  <chr>         <chr>     <chr> <int> <int> <int>
1 3/4+6/8+11/16 0/5+15/20 0/5       8    10     5
2 0/5           5/10      3/10      5     5     7
Another base R approach might be the following. First split by +, then split again by /, taking the sum of differences in the resulting values.
datain[paste0(names(datain), ".sum")] <-
  lapply(datain, function(x) {
    sapply(strsplit(x, "[+]"), function(y) {
      sum(sapply(strsplit(y, "[/]"), function(z) diff(as.numeric(z))))
    })
  })
datain
Output
              A         B    C A.sum B.sum C.sum
1 3/4+6/8+11/16 0/5+15/20  0/5     8    10     5
2           0/5      5/10 3/10     5     5     7
Update:
Slightly improved:
library(dplyr)
library(tidyr)
library(data.table)
datain %>%
  pivot_longer(everything()) %>%
  separate_rows(value, sep = "\\+|\\/", convert = TRUE) %>%
  group_by(group = rleid(name)) %>%
  mutate(value = value - lag(value, default = value[1])) %>%
  slice(which(row_number() %% 2 == 0)) %>%
  mutate(value = sum(value),
         name = paste0(name, ".sum")) %>%
  slice(1) %>%
  ungroup() %>%
  select(-group) %>%
  group_by(name) %>%
  mutate(id = row_number()) %>%
  pivot_wider(names_from = name, values_from = value) %>%
  select(-id) %>%
  cbind(datain)
This row:
  separate_rows(value, sep = "\\+|\\/", convert = TRUE) %>%
is the same as:
  separate_rows(value, sep = "\\+") %>%
  separate_rows(value, sep = "\\/") %>%
  type.convert(as.is = TRUE) %>%
And the very, very long way from start to finish :-)
library(dplyr)
library(tidyr)
library(data.table)
datain %>%
  pivot_longer(everything()) %>%
  separate_rows(value, sep = "\\+") %>%
  separate_rows(value, sep = "\\/") %>%
  group_by(group = as.integer(gl(n(), 2, n()))) %>%
  type.convert(as.is = TRUE) %>%
  mutate(x = value - lag(value, default = value[1])) %>%
  ungroup() %>%
  group_by(group = rleid(name)) %>%
  mutate(x = sum(x)) %>%
  mutate(labels = paste0(name, ".sum")) %>%
  slice(1) %>%
  ungroup() %>%
  select(-c(name, group, value)) %>%
  pivot_wider(names_from = labels,
              values_from = x,
              values_fn = list) %>%
  unnest(cols = c(A.sum, B.sum, C.sum)) %>%
  cbind(datain)
  A.sum B.sum C.sum             A         B    C
1     8    10     5 3/4+6/8+11/16 0/5+15/20  0/5
2     5     5     7           0/5      5/10 3/10

How to use grep or any other method to compare different numbers of rows in two data frames and get the matches and mismatches?

Below are my two data frames:
ABData1 <- data.frame(id = c(11, 12, 13, 14, 15),
                      a = c(1, 2, 3, 4, 5))
ABData2 <- data.frame(id = c(11, 12, 13, 14),
                      b = c(1, 4, 3, 4))
How do I compare these two data frames for matching and mismatching rows? If the value in the 1st row of a in ABData1 matches the 1st row of b in ABData2, show it as a match, otherwise as a mismatch, then move on to the 2nd row, and so on; all comparisons are row-wise. I have tried the code below, which works fine for a single data frame, but it throws an error here because the two data frames have different numbers of rows.
ABData <- data.frame(a = c(1, 2, 2, 1, 1),
                     b = c(1, 2, 1, 1, 2))
match <- ABData %>% rowwise() %>% filter(grepl(a, b, fixed = TRUE))
mismatch <- ABData %>% rowwise() %>% filter(!grepl(a, b))
I am expecting the output below.
Expected match output:
id a expected b
11 1 1 1
13 3 3 3
14 4 4 4
Expected mismatch output:
id a expected b
12 2 2 4
15 NA NA 5
Thanks in advance.
You can use this:
ABData1 <- data.frame(a = c(1, 2, 3, 4, 5))
ABData2 <- data.frame(b = c(1, 4, 3, 4))
equLength <- function(x, y) {
  if (length(x) > length(y)) length(y) <- length(x) else length(x) <- length(y)
  data.frame(a = x, b = y)
}
ABData <- equLength(ABData1$a, ABData2$b)
... and then use your working code for one data frame:
library("dplyr")
resultMatch <- ABData %>% rowwise() %>% filter(grepl(a, b, fixed = TRUE))
resultMismatch <- ABData %>% rowwise() %>% filter(!grepl(a, b))
For the extended question:
library("dplyr")
ABData1 <- data.frame(id = c(11, 12, 13, 14, 15), a = c(1, 2, 3, 4, 5))
ABData2 <- data.frame(id = c(11, 12, 13, 14), b = c(1, 4, 3, 4))
equLength <- function(x, y) {
  if (length(x) > length(y)) length(y) <- length(x) else length(x) <- length(y)
  data.frame(a = x, b = y)
}
if (nrow(ABData1) > nrow(ABData2)) {
  ABData <- data.frame(ABData1, b = equLength(ABData1$a, ABData2$b)$b)
} else {
  ABData <- data.frame(ABData2, a = equLength(ABData1$a, ABData2$b)$a)
}
resultMatch <- ABData %>% rowwise() %>% filter(grepl(a, b, fixed = TRUE))
resultMismatch <- ABData %>% rowwise() %>% filter(!grepl(a, b))

How to use apply functions correctly when there are NA values

I'd like to calculate a function on multiple columns of a data frame with random NA values. I have two questions:
1. How to deal with NAs? The code runs when I try it on non-NA columns, but returns NA when there are NAs, even though I remove them.
2. How to print the results in data frame format instead of multiple arrays? I used mapply, but it doesn't seem to do the calculations correctly.
Here is my code:
# create a data frame with random NAs
library(dplyr)
df <- data.frame(category1 = sample(c(1:10), 100, replace = TRUE),
                 category2 = sample(c(1:10), 100, replace = TRUE))
insert_nas <- function(x) {
  len <- length(x)
  n <- sample(1:floor(0.2*len), 1)
  i <- sample(1:len, n)
  x[i] <- NA
  x
}
df <- sapply(df, insert_nas) %>% as.data.frame()
df$type <- sample(c("A", "B", "C"), 100, replace = TRUE)
# using apply:
library(NPS)
apply(df[, c('category1', 'category2')], 2,
      function(x) df %>% filter(!is.na(x)) %>% group_by(type) %>%
        transmute(nps(x)) %>% unique())
#results:
$category1
# A tibble: 3 x 2
# Groups:   type [3]
  type  `nps(x)`
  <chr>    <dbl>
1 B           NA
2 A           NA
3 C           NA
...
# using mapply
mapply(function(x) df %>% filter(!is.na(x)) %>% group_by(type) %>%
         transmute(nps(x)) %>% unique(), df[, c('category1', 'category2')])
# results:
       category1   category2
type   Character,3 Character,3
nps(x) Numeric,3   Numeric,3
Regarding the function I use, it doesn't have a built in way to deal with NAs, so I remove NAs prior to calling it.
I still used the !is.na part of your code because it seems that nps can't deal with NAs, even though the documentation says it should (possibly a bug). I changed your apply to lapply and passed the variable names as the list. Then I used get to treat the quoted variable name as a column of your df.
df <- data.frame(category1 = sample(c(1:10), 100, replace = TRUE),
                 category2 = sample(c(1:10), 100, replace = TRUE))
insert_nas <- function(x) {
  len <- length(x)
  n <- sample(1:floor(0.2*len), 1)
  i <- sample(1:len, n)
  x[i] <- NA
  x
}
df <- sapply(df, insert_nas) %>% as.data.frame()
df$type <- sample(c("A", "B", "C"), 100, replace = TRUE)
# using lapply:
library(NPS)
df2 <- as.data.frame(lapply(c('category1', 'category2'),
  function(x) df %>% filter(!is.na(get(x))) %>% group_by(type) %>%
    transmute(nps(get(x))) %>% unique()
), stringsAsFactors = FALSE)
colnames(df2) <- c("type", "nps_cat1", "type2", "nps_cat2")
# type2 is redundant
df2 <- select(df2, -type2)
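With current dplyr (>= 1.0), a more compact sketch of the same computation uses across(); here na.omit() stands in for the !is.na() filtering, under the same assumption that nps() just needs the NAs removed first:
library(dplyr)
library(NPS)
df %>%
  group_by(type) %>%
  summarise(across(c(category1, category2),
                   ~ nps(na.omit(.x)),
                   .names = "nps_{.col}"))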

Apply function over data frame rows

I'm trying to apply a function over the rows of a data frame and return a value based on the value of each element in a column. I'd prefer to pass the whole data frame instead of naming each variable, as the actual code has many variables; this is a simple example.
I've tried purrr map_dbl and rowwise but can't get either to work. Any suggestions please?
# sample df
df <- data.frame(Y = c("A", "B", "B", "A", "B"),
                 X = c(1, 5, 8, 23, 31))
# required result
Res <- data.frame(Y = c("A", "B", "B", "A", "B"),
                  X = c(1, 5, 8, 23, 31),
                  NewVal = c(10, 500, 800, 230, 3100))
# use mutate and map or rowwise etc.
Res <- df %>%
  mutate(NewVal = map_dbl(.x = ., .f = FnAdd(.)))
Res <- df %>%
  rowwise() %>%
  mutate(NewVal = FnAdd(.))
# sample fn
FnAdd <- function(Data){
  if (Data$Y == "A"){
    X = Data$X * 10
  }
  if (Data$Y == "B"){
    X = Data$X * 100
  }
  return(X)
}
If there are multiple values, it is better to have a key/value dataset, join, and then do the multiplication:
keyVal <- data.frame(Y = c("A", "B"), NewVal = c(10, 100))
df %>%
  left_join(keyVal) %>%
  mutate(NewVal = X * NewVal)
# Y X NewVal
#1 A 1 10
#2 B 5 500
#3 B 8 800
#4 A 23 230
#5 B 31 3100
It is not clear how many unique values there are in the actual dataset's 'Y' column. If there are only a few values, then case_when can be used:
FnAdd <- function(Data){
  Data %>%
    mutate(NewVal = case_when(Y == "A" ~ X * 10,
                              Y == "B" ~ X * 100,
                              TRUE ~ X))
}
FnAdd(df)
FnAdd(df)
# Y X NewVal
#1 A 1 10
#2 B 5 500
#3 B 8 800
#4 A 23 230
#5 B 31 3100
You were originally looking for a solution using dplyr's rowwise() function, so here is that solution. The nice thing about this approach is that you don't need to create a separate function.
Here's the version using ifelse() (the inner ifelse() needs a third argument; the original nested call omitted it, which would error whenever Y is neither "A" nor "B"):
df %>%
  rowwise() %>%
  mutate(NewVal = ifelse(Y == "A", X * 10,
                         ifelse(Y == "B", X * 100, NA_real_)))
and here's the version using case_when():
df %>%
  rowwise() %>%
  mutate(NewVal = case_when(Y == "A" ~ X * 10,
                            Y == "B" ~ X * 100))
(Both ifelse() and case_when() are vectorised, so the rowwise() call is not strictly necessary here.)

How to use a function argument to set a column name

I have a dataframe df as follows:
df <- data.frame(x = c("A", "A", "B", "B"), y = 1:4)
And I have a function that finds the mean of y grouped by x:
generateVarMean <- function(df, x, y) {
  mean.df <- df %>%
    select(x, y) %>%
    group_by(x) %>%
    dplyr::summarise(variable.mean = mean(y, na.rm = TRUE))
  colnames(mean.df)[2] <- paste0("y", ".mean")
  print(mean.df)
}
However, I want the first argument of paste0() to reflect the actual function argument (i.e. so that it can be used for different dataframes).
Desired functionality:
df1 <- data.frame(a = c("A", "A", "B", "B"), b = 1:4)
generateVarMean(df1, a, b)
  a b.mean
1 A    1.5
2 B    3.5
Any help getting pointed in the right direction is very much appreciated.
We can make use of the quosure from the devel version of dplyr (soon to be released as 0.6.0):
generateVarMean <- function(df, x, y) {
  x <- enquo(x)
  y <- enquo(y)
  newName <- paste0(quo_name(y), ".mean")
  df %>%
    select(UQ(x), UQ(y)) %>%
    group_by(UQ(x)) %>%
    summarise(UQ(newName) := mean(UQ(y), na.rm = TRUE))
}
generateVarMean(df1, a, b)
# A tibble: 2 × 2
#       a b.mean
#  <fctr>  <dbl>
#1      A    1.5
#2      B    3.5
We get the input arguments as quosures with enquo, and convert the quosure to a string with quo_name to create the 'newName' string. Then we evaluate the quosures inside select/group_by/summarise by unquoting (UQ or !!). Note that in the new version we can also assign the column name directly, using the assignment operator (:=).
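For reference, on current releases (dplyr >= 1.0, rlang >= 0.4) the same function can be sketched with the curly-curly operator and glue-style name injection; this assumes a recent version, since the older UQ() spelling is now deprecated:
generateVarMean <- function(df, x, y) {
  df %>%
    group_by({{ x }}) %>%
    summarise("{{ y }}.mean" := mean({{ y }}, na.rm = TRUE))
}
generateVarMean(df1, a, b)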
No need to add anything to the function. Just replace paste0("y", ".mean") with paste0(deparse(substitute(y)), ".mean"). The function and the output will then be:
> generateVarMean <- function(df, x, y) {
    mean.df <- df %>%
      select(x, y) %>%
      group_by(x) %>%
      dplyr::summarise(variable.mean = mean(y, na.rm = TRUE))
    colnames(mean.df)[2] <- paste0(deparse(substitute(y)), ".mean")
    print(mean.df)
  }
> generateVarMean(df, a, b)
# A tibble: 2 × 2
       x b.mean
  <fctr>  <dbl>
1      A    1.5
2      B    3.5
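A quick sketch of what deparse(substitute()) does on its own (base R only):
f <- function(y) paste0(deparse(substitute(y)), ".mean")
f(b)
#[1] "b.mean"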
