Summarizing by multiple groups in R - r

I'm trying to summarize a dataset based on "station" and "depth bin" with total counts of family for each. This is how the dataset looks:
The end result should look like this"
...

Using dplyr,
Data
df <- read.table(text = "Family Station 'Total Count' 'Depth Bin'
Macrouridae 1504-04 1 2500-2550
Ophidiidae 1504-04 1 3500-3550
Synaphobranchidae 1504-05 2 3000-3050", header= TRUE)
Code
library(dplyr)
library(tidyr)
df %>%
group_by(Family,Station, Depth.Bin) %>%
summarise(n = sum(Total.Count)) %>%
mutate(newcol = paste0(c(Station, Depth.Bin), collapse = ":")) %>%
ungroup() %>%
select(Family, n, newcol) %>%
mutate(row = row_number()) %>%
pivot_wider(names_from = newcol, values_from = n) %>%
select(-row)
Family `1504-04:2500-2550` `1504-04:3500-3550` `1504-05:3000-3050`
<chr> <int> <int> <int>
1 Macrouridae 1 NA NA
2 Ophidiidae NA 1 NA
3 Synaphobranchidae NA NA 2

Base-R version, with tapply (I changed some of your variable names to avoid spaces):
dd <- read.table(header = TRUE, text = "
Family Station Total_Count Depth_Bin
Macrouridae 1504-04 1 2500-2550
Ophidiidae 1504-04 1 3500-3550
Synaphobranchidae 1504-05 2 3000-3050
")
with(dd, tapply(
Total_Count,
list(Family, interaction(Station, Depth_Bin, sep = ":")),
FUN = sum))

Related

calculating the duration and the order of non-continuous events in R

My dataset consists of a series of behaviours observed in videos. For each behaviour, I have recorded when it starts and when it ends.
datain <-data.frame(
A=c("1/5+11/18","0/5","7/10"),
B=c("6/10+19/25","11/15","11/20"),
C=c("26/30","6/10","0/6"))
I would like to get the duration of each behaviour as well as the order of the behaviours for each observation, like in this desired output
dataout <-data.frame(
A=c("1/5+11/18","0/5","7/10"),
B=c("6/10+19/25","11/15","11/20"),
C=c("26/30","6/10","0/6"),
A.sum=c(11,5,3),
B.sum=c(10,4,9),
C.sum=c(4,4,6),
myorder=c("A/B/A/B/C","A/C/B","C/A/B"))
I am experimenting with the following lines to identify which columns have the + and to extract the rows with the interrupted behaviours (but I still have to calculate the duration of each behaviour), but I guess there could be more efficient solution than the one I am currently attempting.
d.1 <- lapply(datain, function(x) str_which(x,"\\+"))
d.2 <- which(lapply(d.1,length)>0)
coltosum <- match(names(d.2),colnames(datain))
mylist <- lapply(datain[coltosum],function(x) strsplit(x,"\\+"))
As always, I would greatly appreciate any suggestion.
Please note that I have edited this question after some days to include in the desired output the order of the behaviours.
Update: I have been able to figure out how to get the sequence of the behaviours. I bet there are more elegant and concise ways to get this result. Below the code
#removing empty columns
empty_columns <- sapply(datain, function(x) all(is.na(x) | x == ""))
datain<- datain[, !empty_columns]
#loop 1#
#this loop is for taking the occurrence of BH
mylist <- list()
for (i in seq(1,nrow(datain))){
mylist <- apply(datain,1,str_extract_all,pattern="\\d+")
myindx <- sapply(mylist, length)
myres <- c(do.call(cbind,lapply(mylist, `length<-`,max(myindx))))
names(myres) <- rep(colnames(datain),nrow(datain))
mydf <- ldply(myres,data.frame)
colnames(mydf) <- c("BH","values")
}
#loop 2#
#this loop is for counting the number of elements in a nested list
mydf.1 <- list()
myres.2 <- list()
for (i in seq(1,nrow(datain))){
mydf.1 <- length(unlist(mylist[i]))
myres.2[i] <- mydf.1
}
#this is for placing the row values
names(myres.2) <- rownames(datain)
myres.3 <- as.numeric(myres.2)
mydf$myrow <- c(rep(rownames(datain),myres.3))
#I can order by row and by values
mydf <- mydf[order(as.numeric(mydf$myrow),as.numeric(mydf$values)),]
#I have to pick up the right values
#I have to generate as many sequences as many elements for each row.
myseq <- sequence(myres.3)
mydf <- cbind(mydf,myseq)
myseq.2 <- seq(1,nrow(mydf),by=2)
#selecting the df according to the uneven row
mydf.1 <- mydf[myseq.2,]
myorder <-split(mydf.1,mydf.1$myrow)
#loop 3
myres.3 <- list()
for (i in seq(1,nrow(datain))){
myres.3 <- lapply(myorder,"[",i=1)
}
myorder.def <- data.frame(cbind(lapply(myres.3,paste0,collapse="/")))
colnames(myorder.def) <- "BH"
#last step, apply str_extract_all for each row
myorder.def$BH <- str_replace_all(myorder.def$BH,"c","")
myorder.def$BH <- str_replace_all(myorder.def$BH,"\\(","")
myorder.def$BH <- str_replace_all(myorder.def$BH,"\\)","")
myorder.def$BH <- str_replace_all(myorder.def$BH,"\"","")
myorder.def$BH <- str_replace_all(myorder.def$BH,", ","/")
data.out <- cbind(datain,myorder.def)
data.out
Stef
An option in base R would be to loop over the columns (lapply) of the dataset, then replace the digits (\\d+) followed by / and digits to denominator - numerator by capturing those digits and switching the backreferences (\\2-\\1), and eval(parse the string
datain[paste0(names(datain), ".sum")] <- lapply(datain, function(y)
sapply(gsub("(\\d+)/(\\d+)", "(\\2-\\1)", y),
function(x) eval(parse(text = x))))
-checking with OP's output
> datain
A B C A.sum B.sum C.sum
1 3/4+6/8+11/16 0/5+15/20 0/5 8 10 5
2 0/5 5/10 3/10 5 5 7
> dataout
A B C A.sum B.sum C.sum
1 3/4+6/8+11/16 0/5+10/5 0/5 8 10 5
2 0/5 5/10 3/10 5 5 7
Or with tidyverse, group by rows, loop across all the columns, read the string into a data.frame with read.table, subtract the columns, get the sum and return as new columns by modifying the .names
library(dplyr)
library(stringr)
datain %>%
rowwise %>%
mutate(across(everything(), ~ sum(with(read.table(text =
str_replace_all(.x, fixed("+"), "\n"), sep = "/",
header = FALSE), V2 - V1)), .names = "{.col}.sum")) %>%
ungroup
-output
# A tibble: 2 × 6
A B C A.sum B.sum C.sum
<chr> <chr> <chr> <int> <int> <int>
1 3/4+6/8+11/16 0/5+15/20 0/5 8 10 5
2 0/5 5/10 3/10 5 5 7
Another base R approach might be the following. First split by +, then split again by /, taking the sum of differences in the resulting values.
datain[paste0(names(datain), ".sum")] <-
lapply(datain, function(x) {
sapply(strsplit(x, "[+]"), function(y) {
sum(sapply(strsplit(y, "[/]"), function(z) {
diff(as.numeric(z)) }
))
})
})
datain
Output
A B C A.sum B.sum C.sum
1 3/4+6/8+11/16 0/5+15/20 0/5 8 10 5
2 0/5 5/10 3/10 5 5 7
Update:
Slightly improved:
library(dplyr)
library(tidyr)
library(data.table)
datain %>%
pivot_longer(everything()) %>%
separate_rows(value, sep = "\\+|\\/", convert = TRUE) %>%
group_by(group = rleid(name)) %>%
mutate(value = value - lag(value, default = value[1])) %>%
slice(which(row_number() %% 2 == 0)) %>%
mutate(value = sum(value),
name = paste0(name, ".sum")) %>%
slice(1) %>%
ungroup() %>%
select(-group) %>%
group_by(name) %>%
mutate(id = row_number()) %>%
pivot_wider(names_from = name, values_from = value) %>%
select(-id) %>%
cbind(datain)
This row
separate_rows(value, sep = "\\+|\\/", convert = TRUE) %>%
is same as
separate_rows(value, sep = "\\+") %>%
separate_rows(value, sep = "\\/") %>%
type.convert(as.is = TRUE) %>%
The very very long way until finish: :-)
library(dplyr)
library(tidyr)
library(data.table)
datain %>%
pivot_longer(everything()) %>%
separate_rows(value, sep = "\\+") %>%
separate_rows(value, sep = "\\/") %>%
group_by(group =as.integer(gl(n(),2,n()))) %>%
type.convert(as.is = TRUE) %>%
mutate(x = value - lag(value, default = value[1])) %>%
ungroup() %>%
group_by(group = rleid(name)) %>%
mutate(x = sum(x)) %>%
mutate(labels = paste0(name, ".sum")) %>%
slice(1) %>%
ungroup() %>%
select(-c(name, group, value)) %>%
pivot_wider(names_from = labels,
values_from = x,
values_fn = list) %>%
unnest(cols = c(A.sum, B.sum, C.sum)) %>%
cbind(datain)
A.sum B.sum C.sum A B C
1 8 10 5 3/4+6/8+11/16 0/5+15/20 0/5
2 5 5 7 0/5 5/10 3/10

Search elements of a single character string in a dataframe column to subset it

I have two dataframes:
set.seed(1)
df1 <- data.frame(k1 = "AFD(1);Acf(2);Vgr7(2);"
,k2 = "ABC(7);BHG(46);TFG(675);")
df2 <- data.frame(site =c("AFD(1);AFD(2);", "Acf(2);", "TFG(677);",
"XX(275);", "ABC(7);", "ABC(9);")
,p1 = rnorm(6, mean = 5, sd = 2)
,p2 = rnorm(6, mean = 6.5, sd = 2))
The first dataframe is in fact a list of often very long strings, made of 'elements". Each "element" is made of a few letters/numbers, followed by a number in brackets, followed by a semicolon. In this example I only put 3 "elements" into each string, but in my real dataframe there are tens to hundreds of them.
> df1
k1 k2
1 AFD(1);Acf(2);Vgr7(2); ABC(7);BHG(46);TFG(675);
The second dataframe shares some of the "elements" with df1. Its first column, called site, contains some (not all) "elements" from the first dataframe, sometimes the "element" forms the whole string, and sometimes is a part of a longer string:
> df2
site p1 p2
1 AFD(1);AFD(2); 4.043700 3.745881
2 Acf(2); 5.835883 5.670011
3 TFG(677); 7.717359 5.711420
4 XX(275); 4.794425 6.381373
5 ABC(7); 5.775343 8.700051
6 ABC(9); 4.892390 8.026351
I would like to filter the whole df2 using df2$site and each k column from df1 (there are many K columns, not all of them contain k in the names).
The easiest way to explain this is to show how the desired output would look like.
> outcome
k site p1 p2
1 k1 AFD(1);AFD(2): 4.043700 3.745881
2 k1 Acf(2); 5.835883 5.670011
3 k2 ABC(7); 5.775343 8.700051
The first column of the outcome dataframe corresponds to the column names in df1. The second column corresponds to the site column of df2 and contains only sites from df1 columns that were found in df2$sites. Other columns are from df2.
I appreciate that this question is made of two separate "problems", one grepping-related and one related to looping through df1 columns. I decided to show the task in its entirety in case there exists a solution that addresses both in one go.
FAILED SOLUTION 1
I can create a string to grep, but for each column separately:
# this replaces the semicolons with "|", but does not escape the brackets.
k1_pattern <- df1 %>%
select(k1) %>%
deframe() %>%
str_replace_all(";","|")
And then I am not sure how to use it. This (below) didn't work, maybe because I didn't escape brackets, but I am struggling with doing it:
k1_result <- df2 %>%
filter(grepl(pattern = k1_pattern, site))
But even if it did work, it would only deal with a single column from df1, and I have many, and would like to perform this operation on all df1 columns at the same time.
FAILED SOLUTION 2
I can create a list of sites to search in df2 from columns in df1:
k1_sites<- df1 %>%
select(k1) %>%
deframe() %>%
strsplit(., "[;]") %>%
unlist()
but the delimiter is lost here, and %in% cannot be used, as the match will sometimes be partial.
library(dplyr)
df2 %>%
mutate(site_list = strsplit(site, ";")) %>%
rowwise() %>%
filter(length(intersect(site_list,
unlist(strsplit(x = paste0(c(t(df1)), collapse=""),
split = ";")))) != 0) %>%
select(-site_list)
#> # A tibble: 3 x 3
#> # Rowwise:
#> site p1 p2
#> <chr> <dbl> <dbl>
#> 1 AFD(1);AFD(2); 3.75 7.47
#> 2 Acf(2); 5.37 7.98
#> 3 ABC(7); 5.66 9.52
Updated answer:
library(dplyr)
library(tidyr)
df1 %>%
rownames_to_column("id") %>%
pivot_longer(-id, names_to = "k", values_to = "site") %>%
separate_rows(site, sep = ";") %>%
filter(site != "") %>%
select(-id) -> df1_k
df2 %>%
tibble::rownames_to_column("id") %>%
separate_rows(site, sep = ";") %>%
full_join(., df1_k, by = c("site")) %>%
group_by(id) %>%
fill(k, .direction = "downup") %>%
filter(!is.na(id) & !is.na(k)) %>%
summarise(k = first(k),
site = paste0(site, collapse = ";"),
p1 = first(p1),
p2 = first(p2), .groups = "drop") %>%
select(-id)
#> # A tibble: 3 x 4
#> k site p1 p2
#> <chr> <chr> <dbl> <dbl>
#> 1 k1 AFD(1);AFD(2); 3.75 7.47
#> 2 k1 Acf(2); 5.37 7.98
#> 3 k2 ABC(7); 5.66 9.52
Here's a way going to a long format for exact matching (so no regex):
library(dplyr)
library(tidyr)
df1_long = df1 |> stack() |>
separate_rows(values, sep = ";") |>
filter(values != "")
df2 |>
mutate(id = row_number()) |>
separate_rows(site, sep = ";") |>
filter(site != "") |>
left_join(df1_long, by = c("site" = "values")) %>%
group_by(id) |>
filter(any(!is.na(ind))) %>%
summarize(
site = paste(site, collapse = ";"),
across(-site, \(x) first(na.omit(x)))
)
# # A tibble: 3 × 5
# id site p1 p2 ind
# <int> <chr> <dbl> <dbl> <fct>
# 1 1 AFD(1);AFD(2) 3.75 7.47 k1
# 2 2 Acf(2) 5.37 7.98 k1
# 3 5 ABC(7) 5.66 9.52 k2

How to summarise grouped value increases

I have this type of data:
df <- data.frame(
Utt = c(rep("oh", 10), rep("ah", 10)),
name = rep(LETTERS[1:2], 10),
value = c(0.5,2,2,2,2,1,0,1,3.5,1,
2.2,2.3,1.9,0.1,0.3,1.8,3,4,3.5,2)
)
I need to know whether within in each group of Utt and name, there are continuous value increases and how large these increases are.
EDIT: I've cobbled together this code, which produces the right result but seems convoluted:
df %>%
# order by name:
arrange(name) %>%
group_by(name, Utt) %>%
# mutate:
mutate(
# is there an increase from one value to the next?
is_increase = ifelse(lag(value) < value, value, NA),
# what's the difference between these values?
diff = is_increase - lag(value)) %>%
group_by(name, Utt, grp = rleid(!is.na(diff))) %>%
# sum the contiguous values:
summarise(increase_size = sum(diff, na.rm = TRUE)) %>%
# remove 0 values:
filter(!increase_size == 0) %>%
# put same-group increase_sizes in the same row:
summarise(
increase_size = str_c(increase_size, collapse = ', '))
# A tibble: 3 x 3
# Groups: name [2]
name Utt increase_size
<chr> <chr> <chr>
1 A ah 3.2
2 A oh 1.5, 3.5
3 B ah 3.9
NOTE: Ideally, the expected outcome would be:
1 A ah 3.2
2 A oh 1.5, 3.5
3 B ah 3.9
4 B oh NA
Is there a better (i.e., more concise, more clever) dplyr solution?
Use this function to find what you want.
f <- function(x) {
ind <- which(x > lag(x))
if (length(ind) == 0) {
return(NA)
}
ind2 <- ind[which(lead(ind, default = max(ind) + 2) - ind > 1)]
ind1 <- ind[which(ind - lag(ind, default = min(ind) - 2) > 1)] - 1
return(paste0(x[ind2] - x[ind1], collapse = ", "))
}
And use the function in summarise:
df %>% group_by(name, Utt) %>% summarise(increase = f(value))
Using tidyverse, my solution was similar to yours. One possible modification might be to subset your columns before summing instead of filtering. This will keep all combinations of name and Utt and allow for NA for increase_size in the end. Since the column increase_size is character type, you can convert an empty string to NA.
library(data.table)
library(tidyverse)
df %>%
arrange(name) %>%
group_by(name, Utt) %>%
mutate(diff = c(0, diff(value))) %>%
group_by(grp = rleid(diff < 0), .add = T) %>%
summarise(increase_size = sum(diff[diff > 0], na.rm = T)) %>%
group_by(name, Utt) %>%
summarise(increase_size = toString(increase_size[increase_size > 0])) %>%
mutate(increase_size = na_if(increase_size, ""))
Output
name Utt increase_size
<chr> <chr> <chr>
1 A ah 3.2
2 A oh 1.5, 3.5
3 B ah 3.9
4 B oh NA

convert values in a column consists of text map into integers and aggregate them

I have a dataframe as following:
data.frame("id" = 1:2, "tag" = c("a,b,c","a,d"))
id tag
1 a,b,c
2 a,d
in tag where ever is a or b consider as lan and and "d"="c"="con" means that a and b are consider as lan , d and c consider as con then we want to count the number of lan and con in each row in seperate columns like table in below:
I want to create two columns which are the aggregation of a,b,c to shows like the follows:
id tag. lan_count. con_count
1 a,b,c 2 1
2 a,d 1 1
Could you please give me advice how to do this.
You can also use the following code:
library(dplyr)
library(tidyr)
df <- data.frame("id" = 1:2, "tag" = c("a,b,c","a,d"))
df %>%
separate_rows(tag, sep = ",") %>%
group_by(id) %>%
add_count(tag) %>%
pivot_wider(id, names_from = tag, values_from = n) %>%
rowwise() %>%
mutate(lan_count = sum(c_across(a:b), na.rm = TRUE),
con_count = sum(c_across(c:d), na.rm = TRUE)) %>%
select(-c(a:d))
# A tibble: 2 x 3
# Rowwise: id
id lan_count con_count
<int> <int> <int>
1 1 2 1
2 2 1 1
The main issue here is that your data is untidy. So my solution is in two parts: first, tidy the data and then summarise it. Once the data is tidy, the summary is trivial.
library(tidyverse)
# Adjust to suit your real data
maxCols <- 10
d <- data.frame(id = 1:2, tag = c("a,b,c","a,d"))
d %>%
separate(
tag,
sep=",",
into=paste0("Element", 1:maxCols),
extra="drop",
fill="right",
remove=FALSE
) %>%
pivot_longer(
cols=starts_with("Element"),
values_to="Value",
names_prefix="Element"
) %>%
select(-name) %>%
# Remove unused Values
filter(!is.na(Value)) %>%
# At this point the data frame is tidy
group_by(tag) %>%
# Translate tags into "categories". Add more if required. or write a function
mutate(
lan=Value %in% c("a", "b"),
con=Value %in% c("c", "d")
) %>%
# Adjust the column specification if more categories are added.
# Or use a factor instead of binary indicators
summarise(across(lan:con, sum))
# A tibble: 2 x 3
tag lan con
* <fct> <int> <int>
1 a,b,c 2 1
2 a,d 1 1

R: How do I c() nested character vectors grouped by another column?

I have strings containing enumerations of words grouped under word type. The example below only has one type for simplicity's sake.
ka = tibble(
words = c('apple, orange', 'pear, apple, plum'),
type = 'fruit'
)
I want to find out the number of UNIQUE words per type.
I figured I would split the character vectors,
ka = ka %>%
mutate(
word_list = str_split(words, ', ')
)
and then bind the columns per group. The end result would be
c(
ka$word_list[[1]],
ka$word_list[[2]],
)
Then I can unique these vectors and get their length.
I don't know how to bind columns together, grouped by a separate column. I could do this with an ugly loop within a loop, but there must be a map/apply solution as well, following the logic of:
ka %>%
group_by(type) %>%
summarise(
biglist = map(word_list, ~ c(.)), # this doesn't work, obviously
biglist_unique = map(biglist, ~ unique(.)),
biglist_length = map(biglist_unique, ~ length(.))
)
Here is an option for you. First we collapse the vectors, then we map out what you're looking for. Note that we have to trim off the whitespace to get the proper unique words.
library(tidyverse)
ka %>%
group_by(type) %>%
summarise(all_words = paste(words, collapse = ",")) %>%
mutate(biglist = str_split(all_words, ",") %>% map(., ~str_trim(.x, "both")),
biglist_unique = map(biglist, ~.x[unique(.x)]),
biglist_length = map_dbl(biglist_unique, length))
#> # A tibble: 1 x 5
#> type all_words biglist biglist_unique biglist_length
#> <chr> <chr> <list> <list> <dbl>
#> 1 fruit apple, orange,pear, apple, plum <chr [5]> <chr [4]> 4
Another option would be to use tidy data principles and the tidyr package.
ka = ka %>%
mutate(
word_list = str_split(words, ', ')
)
ka %>%
# If you need to maintain information about each row you can create an index
# mutate(index = row_number()) %>%
# unnest the wordlist to get one word per row
unnest(word_list) %>%
# Only keep unique words per group
group_by(type) %>%
distinct(word_list, .keep_all = FALSE) %>% # if you need to maintain row info .keep_all = TRUE
summarise(n_unique = n())
# A tibble: 1 x 2
# type n_unique
# <chr> <int>
# 1 fruit 4
Here's a way you can do using separate_rows:
ka %>%
separate_rows(words, sep = ', ') %>%
group_by(type) %>%
summarise(word_c = n_distinct(words))
Something like this:
library(tidyverse)
ka %>%
mutate(words = strsplit(as.character(words), ",")) %>%
unnest(words) %>%
mutate(words = gsub(" ","",words)) %>%
group_by(type) %>%
summarise(number = n_distinct(words),
words = paste0(unique(words), collapse =' '))
# A tibble: 1 x 3
type number words
<chr> <int> <chr>
1 fruit 4 apple orange pear plum

Resources