How to reduce aligned sequences to start and end coordinates in R?

I have a data table of labelled coordinates that are aligned between two groups (A and B). For example:
library(data.table)
dt_long <- data.table(LABEL_A = c(rep("A", 20), rep("A", 15), rep("A", 10), rep("A", 15), rep("A", 10)),
                      SEQ_A = c(11:30, 61:75, 76:85, 86:100, 110:119),
                      LABEL_B = c(rep("C", 20), rep("D", 15), rep("F", 10), rep("G", 15), rep("D", 10)),
                      SEQ_B = c(1:20, 25:11, 16:25, 15:1, 1:5, 8:12))
How can I reduce this information to a short format, where the start and end coordinates of each aligned sequence are given? For example:
dt_short <- data.table(LABEL_A = c("A", "A", "A", "A", "A", "A"),
                       Start_A = c(11, 61, 76, 86, 110, 115),
                       End_A = c(30, 75, 85, 100, 114, 119),
                       LABEL_B = c("C", "D", "F", "G", "D", "D"),
                       Start_B = c(1, 25, 16, 15, 1, 8),
                       End_B = c(20, 11, 25, 1, 5, 12))
The length of each aligned sequence should be identical. For example:
identical(abs(dt_short$End_A - dt_short$Start_A), abs(dt_short$End_B - dt_short$Start_B))

You can make use of rleid, incorporating Frank's comment to remove the grouping column:
dt_long[, .(
  LABEL_A = LABEL_A[1L], Start_A = SEQ_A[1L], End_A = SEQ_A[.N],
  LABEL_B = LABEL_B[1L], Start_B = SEQ_B[1L], End_B = SEQ_B[.N]),
  by = rleid(LABEL_A, LABEL_B,
             c(0L, cumsum(diff(SEQ_A) > 1L)),  # start a new run at each gap in SEQ_A
             c(0L, cumsum(diff(SEQ_B) > 1L)))  # start a new run at each gap in SEQ_B
][, (1) := NULL]
Output:
LABEL_A Start_A End_A LABEL_B Start_B End_B
1: A 11 30 C 1 20
2: A 61 75 D 25 11
3: A 76 85 F 16 25
4: A 86 100 G 15 1
5: A 110 114 D 1 5
6: A 115 119 D 8 12
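For intuition, rleid assigns a new group id whenever any of its arguments changes from one row to the next, so the two cumsum terms above force a new group at every forward jump larger than 1 in SEQ_A or SEQ_B. A small illustration (output shown as comments):
rleid(c("A", "A", "B", "B", "A"))
# [1] 1 1 2 2 3
rleid(c(1, 1, 1), c("x", "y", "y"))
# [1] 1 2 2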

A straightforward way is to group by the two labels and get the first and last of each group (note: this collapses repeated label pairs such as the two "D" runs above, so it suits data where each pair occurs in one contiguous run), i.e.
library(data.table)
dt_long[, .(Start_A = first(SEQ_A), End_A = last(SEQ_A),
            Start_B = first(SEQ_B), End_B = last(SEQ_B)),
        by = .(LABEL_A, LABEL_B)][]
# LABEL_A LABEL_B Start_A End_A Start_B End_B
#1: 1 3 11 30 1 20
#2: 1 4 61 75 25 11
#3: 1 6 76 85 16 25
#4: 1 7 86 100 15 1

We can just subset and dcast. This would also work seamlessly when there are many different groups of columns:
dcast(dt_long[, .SD[c(1, .N)], .(LABEL_A, LABEL_B)],
      LABEL_A + LABEL_B ~ c("Start", "End")[rowid(LABEL_A, LABEL_B)],
      value.var = c("SEQ_A", "SEQ_B"))
# LABEL_A LABEL_B SEQ_A_End SEQ_A_Start SEQ_B_End SEQ_B_Start
#1: 1 3 30 11 20 1
#2: 1 4 75 61 11 25
#3: 1 6 85 76 25 16
#4: 1 7 100 86 1 15
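For intuition, rowid numbers rows within each group, so after .SD[c(1, .N)] keeps only the first and last row per group, rowid(LABEL_A, LABEL_B) is 1 then 2 within each group, which indexes into c("Start", "End"). A small illustration:
rowid(c("a", "a", "b", "b"))
# [1] 1 2 1 2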

Related

How to quickly count number of unique entries after merging tables

library(data.table)
table1 <- data.table(id1 = c(1324, 2324, 29, 29, 1010, 1010),
                     type = c(1, 1, 2, 1, 1, 1),
                     class = c("A", "A", "B", "D", "D", "A"),
                     number = c(1, 98, 100, 100, 70, 70))
table2 <- data.table(id2 = c(1998, 1998, 2000, 2000, 2000, 2010, 2012, 2012),
                     type = c(1, 1, 3, 1, 1, 5, 1, 1),
                     class = c("D", "A", "D", "D", "A", "B", "A", "A"),
                     min_number = c(34, 0, 20, 45, 5, 23, 1, 1),
                     max_number = c(50, 100, 100, 100, 100, 9, 10, 100))
> table1
id1 type class number
1: 1324 1 A 1
2: 2324 1 A 98
3: 29 2 B 100
4: 29 1 D 100
5: 1010 1 D 70
6: 1010 1 A 70
> table2
id2 type class min_number max_number
1: 1998 1 D 34 50
2: 1998 1 A 0 100
3: 2000 3 D 20 100
4: 2000 1 D 45 100
5: 2000 1 A 5 100
6: 2010 5 B 23 9
7: 2012 1 A 1 10
8: 2012 1 A 1 100
I have two tables, and I would like to merge them based on type, class, and whether number lies between min_number and max_number. Then I would like to create a new variable nMatch that stores the number of unique id2s that match with each id1.
setindexv(table2, c("type", "class"))
for (t1_row in seq_len(nrow(table1))) {
  print(t1_row)
  set(
    table1, t1_row, "matches",
    table2[table1[t1_row],
           on = c("type", "class", "max_number >= number", "min_number <= number"),
           .(list(id2))]
  )
}
> table1[, .(nMatch = uniqueN(unlist(matches), na.rm = TRUE)), by = .(id1)]
id1 nMatch
1: 1324 2
2: 2324 3
3: 29 1
4: 1010 3
The approach above is row-by-row as suggested here, but my real dataset has millions of rows. What's another way of doing this that's faster?
You can do this with a single non-equi join in data.table, using on = .(...) to merge the two tables:
table1[table2,
       .(id1, id2),
       on = .(type, class, number >= min_number, number <= max_number),
       nomatch = NULL
][, .(nMatch = uniqueN(id2)), id1]
which gives:
id1 nMatch
1: 1324 2
2: 1010 3
3: 2324 3
4: 29 1
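The grouping order here follows first appearance in the join result. If you need the output sorted by id1, keyby is an option (a sketch of the same join):
table1[table2, .(id1, id2),
       on = .(type, class, number >= min_number, number <= max_number),
       nomatch = NULL][, .(nMatch = uniqueN(id2)), keyby = id1]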
An option with the tidyverse:
library(dplyr)
library(tidyr)
left_join(table1, table2,
          by = join_by(type, class, number >= min_number, number <= max_number)) %>%
  distinct(id1, id2) %>%
  drop_na() %>%
  count(id1, name = "nMatch")
Output:
id1 nMatch
<num> <int>
1: 29 1
2: 1010 3
3: 1324 2
4: 2324 3

How to subtract value of one group from other groups in R

I am trying to subtract the value of one group from the other groups. I am hoping to use the tidyverse.
df <- structure(list(A = c(1, 1, 1, 2, 2, 2, 3, 3, 3),
                     group = c("a", "b", "c", "a", "b", "c", "a", "b", "c"),
                     value = c(10, 11, 12, 11, 40, 23, 71, 72, 91)),
                class = "data.frame", row.names = c(NA, -9L))
That is my data. I want to subtract the value of group "a" from groups "b" and "c" within each A, and store the difference in one variable.
Base R solution:
# per A, the baseline is the mean of the "a" rows (NA elsewhere, dropped by na.rm)
df$new <- df$value - ave(ifelse(df$group == "a", df$value, NA), df$A,
                         FUN = function(x) mean(x, na.rm = TRUE))
> df
A group value new
1 1 a 10 0
2 1 b 11 1
3 1 c 12 2
4 2 a 11 0
5 2 b 40 29
6 2 c 23 12
7 3 a 71 0
8 3 b 72 1
9 3 c 91 20
dplyr method (this assumes there is at most one "a" value per A group; otherwise value[group == 'a'] is longer than one and R cannot tell which value to subtract, resulting in an error):
library(dplyr)
df %>%
  group_by(A) %>%
  mutate(new = ifelse(group != 'a', value - value[group == 'a'], value))
# A tibble: 9 x 4
# Groups: A [3]
A group value new
<dbl> <chr> <dbl> <dbl>
1 1 a 10 10
2 1 b 11 1
3 1 c 12 2
4 2 a 11 11
5 2 b 40 29
6 2 c 23 12
7 3 a 71 71
8 3 b 72 1
9 3 c 91 20
or, if you want to transform the 'a' rows as well (they become 0):
df %>%
  group_by(A) %>%
  mutate(new = value - value[group == 'a'])
# A tibble: 9 x 4
# Groups: A [3]
A group value new
<dbl> <chr> <dbl> <dbl>
1 1 a 10 0
2 1 b 11 1
3 1 c 12 2
4 2 a 11 0
5 2 b 40 29
6 2 c 23 12
7 3 a 71 0
8 3 b 72 1
9 3 c 91 20
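If duplicate 'a' rows per A group are possible, a hedged variant is to subtract the mean of the 'a' values instead (a sketch, assuming the average is an acceptable baseline):
df %>%
  group_by(A) %>%
  mutate(new = value - mean(value[group == 'a'], na.rm = TRUE))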
I used data.table rather than data.frame only because I'm more familiar with it.
library(data.table)
data <- setDT(structure(list(A = c(1, 1, 1, 2, 2, 2, 3, 3, 3),
                             group = c("a", "b", "c", "a", "b", "c", "a", "b", "c"),
                             value = c(10, 11, 12, 11, 40, 23, 71, 72, 91)),
                        class = "data.frame", row.names = c(NA, -9L)))
for (i in unique(data$A)) {
  # within each A, subtract the group-"a" value from every row
  data[A == i, subtraction := value - value[group == "a"]]
}
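The loop can also be dropped entirely in favour of data.table's grouped assignment, which performs the same subtraction per A in one pass (a sketch, assuming one "a" row per group):
data[, subtraction := value - value[group == "a"], by = A]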

Pivot from wide one time-repeated column to wide [duplicate]

This question already has an answer here:
Using Reshape from wide to long in R [closed]
(1 answer)
Closed 2 years ago.
Suppose I have the following data in that wide format:
data = tibble::tribble(
~ID, ~Time, ~Value, ~ValueX,
"A", 1, 11, 41,
"A", 2, 12, 42,
"A", 3, 13, 43,
"B", 1, 21, 41,
"B", 2, 22, 42,
"B", 3, 23, 43,
"C", 1, 31, 41,
"C", 2, 32, 42,
"C", 3, 33, 43
)
Since ValueX is a repeated variable that does not vary within the ID grouping variable, I just want to add it as new rows identified by ID. This is the desired output:
data.desired = tibble::tribble(
~ID, ~Time, ~Value,
"A", 1, 11,
"A", 2, 12,
"A", 3, 13,
"B", 1, 21,
"B", 2, 22,
"B", 3, 23,
"C", 1, 31,
"C", 2, 32,
"C", 3, 33,
"ValueX", 1, 41,
"ValueX", 2, 42,
"ValueX", 3, 41
)
Here is a way via base R. You can aggregate ValueX per Time, taking the first observation of each, then create a data frame with the same names as your original data and simply rbind, i.e.
rbind(data[-ncol(data)],
      setNames(data.frame('ValueX', aggregate(ValueX ~ Time, data, head, 1)),
               names(data[-ncol(data)])))
# A tibble: 12 x 3
# ID Time Value
# <chr> <dbl> <dbl>
# 1 A 1 11
# 2 A 2 12
# 3 A 3 13
# 4 B 1 21
# 5 B 2 22
# 6 B 3 23
# 7 C 1 31
# 8 C 2 32
# 9 C 3 33
#10 ValueX 1 41
#11 ValueX 2 42
#12 ValueX 3 43
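For reference, the inner aggregate call collapses ValueX to one row per Time, which is what the setNames step then relabels (its output shown as comments):
aggregate(ValueX ~ Time, data, head, 1)
#   Time ValueX
# 1    1     41
# 2    2     42
# 3    3     43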
Using the tidyverse:
library(dplyr)
library(tidyr)
library(tibble)
addCase <- distinct(data, Time, ValueX) %>%
  pivot_longer(-Time, names_to = "ID", values_to = "Value")
data %>%
  select(-ValueX) %>%
  add_case(addCase)
# A tibble: 12 x 3
ID Time Value
<chr> <dbl> <dbl>
1 A 1 11
2 A 2 12
3 A 3 13
4 B 1 21
5 B 2 22
6 B 3 23
7 C 1 31
8 C 2 32
9 C 3 33
10 ValueX 1 41
11 ValueX 2 42
12 ValueX 3 43
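An alternative sketch with bind_rows, renaming ValueX inside distinct (same packages as above):
bind_rows(data %>% select(-ValueX),
          data %>% distinct(Time, Value = ValueX) %>% mutate(ID = "ValueX"))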

Is there a way to create new columns in R based on manipulations from multiple data frames?

Does anyone know if it is possible to use a variable in one dataframe (in my case the "deploy" dataframe) to create a variable in another dataframe?
For example, I have two dataframes:
df1:
deploy <- data.frame(ID = c("20180101_HH1_1_1", "20180101_HH1_1_2", "20180101_HH1_1_3"),
                     Site_Depth = c(42, 93, 40),
                     Num_Depth_Bins_Required = c(5, 10, 4),
                     Percent_Column_in_each_bin = c(20, 10, 25))
df2:
sp.c <- data.frame(species = c("RR", "GS", "GT", "BR", "RS", "BA", "GS", "RS", "SH", "RR"),
                   ct = c(25, 66, 1, 12, 30, 6, 1, 22, 500, 6),
                   percent_dist_from_surf = c(11, 15, 33, 68, 71, 100, 2, 65, 5, 42))
I want to create new columns in df2 that assigns each species and count to a bin based on the Percent_Column_in_each_bin for each ID. For example, in 20180101_HH1_1_3 there would be 4 bins that each make up 25% of the column and all species that are within 0-25% of the column (in df2) would be in bin 1 and species within 25-50% of the column would be in depth bin 2, and so on. What I'm imagining this looking like is:
i.want.this <- data.frame(species = c("RR", "GS", "GT", "BR", "RS", "BA", "GS", "RS", "SH", "RR"),
                          ct = c(25, 66, 1, 12, 30, 6, 1, 22, 500, 6),
                          percent_dist_from_surf = c(11, 15, 33, 68, 71, 100, 2, 65, 5, 42),
                          '20180101_HH1_1_1_Bin' = c(1, 1, 2, 4, 4, 5, 1, 4, 1, 3),
                          '20180101_HH1_1_2_Bin' = c(2, 2, 4, 7, 8, 10, 1, 7, 1, 5),
                          '20180101_HH1_1_3_Bin' = c(1, 1, 2, 3, 3, 4, 1, 3, 1, 2))
I am pretty new to R and I'm not sure how to make this happen. I need to do this for over 100 IDs (all with different depths, number of depth bins, and percent of the column in each bin) so I was hoping that I don't need to do them all by hand. I have tried mutate in dplyr but I can't get it to pull from two different dataframes. I have also tried ifelse statements, but I would need to run the ifelse statement for each ID individually.
I don't know if what I am trying to do is possible but I appreciate the feedback. Thank you in advance!
Edit: my end goal is to find the max count (max ct) for each species within each bin for each ID. What I've been doing to find this (using the bins generated with suggestions from #Ben) is using dplyr to slice and find the max ID like this:
`20180101_HH1_1_1` <- sp.c %>%
  group_by(`20180101_HH1_1_1`, species) %>%
  arrange(desc(ct)) %>%
  slice(1) %>%
  group_by(`20180101_HH1_1_1`) %>%
  mutate(Count_Total_Per_Bin = sum(ct)) %>%
  group_by(species, add = TRUE) %>%
  mutate(species_percent_of_total_in_bin = paste0(100 * ct / Count_Total_Per_Bin)) %>%
  mutate(ID = "20180101_HH1_1_1") %>%
  ungroup()
but I have to do this for over 100 IDs. My desired output would be something like:
end.goal <- data.frame(ID = c(rep("20180101_HH1_1_1", 8)),
                       species = c("RR", "GS", "SH", "GT", "RR", "BR", "RS", "BA"),
                       bin = c(1, 1, 1, 2, 3, 4, 4, 5),
                       Max_count_of_each_species_in_each_bin = c(25, 66, 500, 1, 6, 12, 30, 6),
                       percent_dist_from_surf = c(11, 15, 5, 33, 42, 68, 71, 100),
                       percent_each_species_max_in_each_bin = c((25/591)*100, (66/591)*100, (500/591)*100, 100, 100, (12/42)*100, (30/42)*100, 100))
I was thinking that by answering the original question I could get to this but I see now that there's still a lot you have to do to get this for each ID.
Here is another approach, which does not require a loop.
Using sapply, you can cut the percent_dist_from_surf values in sp.c into bins, using the bin widths given by Percent_Column_in_each_bin in your deploy dataframe.
res <- sapply(deploy$Percent_Column_in_each_bin, function(x) {
  cut(sp.c$percent_dist_from_surf, seq(0, 100, by = x),
      include.lowest = TRUE, labels = 1:(100/x))
})
colnames(res) <- deploy$ID
cbind(sp.c, res)
Or using purrr:
library(purrr)
cbind(sp.c, imap(setNames(deploy$Percent_Column_in_each_bin, deploy$ID),
                 ~ cut(sp.c$percent_dist_from_surf, seq(0, 100, by = .x),
                       include.lowest = TRUE, labels = 1:(100/.x))))
Output
species ct percent_dist_from_surf 20180101_HH1_1_1 20180101_HH1_1_2 20180101_HH1_1_3
1 RR 25 11 1 2 1
2 GS 66 15 1 2 1
3 GT 1 33 2 4 2
4 BR 12 68 4 7 3
5 RS 30 71 4 8 3
6 BA 6 100 5 10 4
7 GS 1 2 1 1 1
8 RS 22 65 4 7 3
9 SH 500 5 1 1 1
10 RR 6 42 3 5 2
Edit:
To determine the maximum ct value for each species, site, and bin, store the result of the cbind above in a dataframe called res and do the following.
First, put it into long form with pivot_longer; then group by species, site, and bin and take the maximum ct for each combination.
library(tidyverse)
res %>%
  pivot_longer(cols = starts_with("2018"), names_to = "site", values_to = "bin") %>%
  group_by(species, site, bin) %>%
  summarise(max_ct = max(ct)) %>%
  arrange(site, bin)
Output
# A tibble: 26 x 4
# Groups: species, site [21]
species site bin max_ct
<fct> <chr> <fct> <dbl>
1 GS 20180101_HH1_1_1 1 66
2 RR 20180101_HH1_1_1 1 25
3 SH 20180101_HH1_1_1 1 500
4 GT 20180101_HH1_1_1 2 1
5 RR 20180101_HH1_1_1 3 6
6 BR 20180101_HH1_1_1 4 12
7 RS 20180101_HH1_1_1 4 30
8 BA 20180101_HH1_1_1 5 6
9 GS 20180101_HH1_1_2 1 1
10 SH 20180101_HH1_1_2 1 500
11 GS 20180101_HH1_1_2 2 66
12 RR 20180101_HH1_1_2 2 25
13 GT 20180101_HH1_1_2 4 1
14 RR 20180101_HH1_1_2 5 6
15 BR 20180101_HH1_1_2 7 12
16 RS 20180101_HH1_1_2 7 22
17 RS 20180101_HH1_1_2 8 30
18 BA 20180101_HH1_1_2 10 6
19 GS 20180101_HH1_1_3 1 66
20 RR 20180101_HH1_1_3 1 25
21 SH 20180101_HH1_1_3 1 500
22 GT 20180101_HH1_1_3 2 1
23 RR 20180101_HH1_1_3 2 6
24 BR 20180101_HH1_1_3 3 12
25 RS 20180101_HH1_1_3 3 30
26 BA 20180101_HH1_1_3 4 6
It is helpful to distinguish between the contents of your two dataframes: df2 appears to contain measurements from some sites, while df1 contains the parameters by which you want to process/summarise the measurements in df2.
Given these different purposes of the two dataframes, your best approach is probably to loop over all the rows of df1 each time adding a column to df2. Something like the following:
max_dist = max(df2$percent_dist_from_surf)
for (ii in 1:nrow(df1)) {
  # extract parameters
  this_ID = df1[[ii, "ID"]]
  this_depth = df1[[ii, "Site_Depth"]]
  this_bins = df1[[ii, "Num_Depth_Bins_Required"]]
  this_percent = df1[[ii, "Percent_Column_in_each_bin"]]
  # add column to df2
  df2 = df2 %>%
    mutate(!!sym(this_ID) := insert_your_calculation_here)
}
The !!sym(this_ID) := part of the code is to allow dynamic naming of your output columns.
And as best I can determine, the formula you want for insert_your_calculation_here is ceiling(percent_dist_from_surf / max_dist * this_bins).
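For concreteness, here is the loop with that formula filled in (a sketch; it assumes dplyr is loaded and df1/df2 are as defined in the question):
library(dplyr)
max_dist <- max(df2$percent_dist_from_surf)
for (ii in 1:nrow(df1)) {
  this_ID   <- df1[[ii, "ID"]]
  this_bins <- df1[[ii, "Num_Depth_Bins_Required"]]
  # assign each measurement to a depth bin for this deployment
  df2 <- df2 %>%
    mutate(!!sym(this_ID) := ceiling(percent_dist_from_surf / max_dist * this_bins))
}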

Compare one variable to other variables by group in R

I have the following data frame:
data <- data.frame(id = c("a", "a", "a", "d", "d"),
                   value = c(5, 46, 12, 14, 32),
                   low = c(46, 8, NA, 0, 34),
                   high = c(56, 20, NA, 12, 60))
data
id value low high
1 a 5 46 56
2 a 46 8 20
3 a 12 NA NA
4 d 14 0 12
5 d 32 34 60
I need to set a new variable to TRUE if value falls outside every interval defined by low and high across the lines with the same id.
My desired dataframe would be:
id value low high result
1 a 5 46 56 TRUE # 5 not in 46-56, 8-20
2 a 46 8 20 FALSE # 46 in 45-56
3 a 12 NA NA FALSE # 12 in 8-20
4 d 14 0 12 TRUE # 14 not in 0-12, 34-60
5 d 32 34 60 TRUE # 32 not in 0-12, 34-60
How can I do it in base R? I work in a restrictive environment where I only have access to base R.
I figured out an ugly and unoptimized solution, but it works! Here is the code:
df <- data.frame(id = c("a", "a", "a", "d", "d"),
                 value = c(5, 46, 12, 14, 32),
                 low = c(46, 8, NA, 0, 34),
                 high = c(56, 20, NA, 12, 60))
list.inter <- list()
for (i in 1:nrow(df)) {
  if (is.na(df$low[i]) | is.na(df$high[i])) {  # skip intervals with a missing bound
    list.inter[[i]] <- NA
  } else {
    list.inter[[i]] <- seq(from = df$low[i], to = df$high[i])
  }
}
result <- c()
for (i in 1:nrow(df)) {
  result[i] <- !df$value[i] %in% unlist(list.inter[which(df$id[i] == df$id)])
}
df$result <- result
I hope it helps and I am curious to see some optimized code from other users!
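Since the answer above invites optimized code, here is one vectorized base R sketch; it compares each value against every interval sharing its id directly, so it also works when the bounds are not integers (no seq() enumeration):
# for each row, test value against every (low, high) interval with the same id
df$result <- !mapply(function(v, g) {
  any(v >= df$low[df$id == g] & v <= df$high[df$id == g], na.rm = TRUE)
}, df$value, df$id)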
Without apply, sapply, or map functions:
isInDataframe <- function(data = data, value = "value", from = "low", to = "high", id = "id") {
  result <- c()
  for (i in seq_len(nrow(data))) {
    # all rows sharing this row's id
    deeta <- data[data[id] == as.character(data[id][i, 1]), ]
    subresult <- c()
    for (j in seq_len(nrow(deeta))) {
      subresult[j] <- (data[value][i, 1] >= deeta[from][j, 1] & data[value][i, 1] <= deeta[to][j, 1])
    }
    result[i] <- !any(subresult, na.rm = TRUE)
  }
  data$result <- result
  return(data)
}
isInDataframe(data = data, value = "value", from = "low", to = "high", id = "id")
isInDataframe(data = data, value = "value", from = "low", to = "high", id = "id")
id value low high result
1 a 5 46 56 TRUE
2 a 46 8 20 FALSE
3 a 12 NA NA FALSE
4 d 14 0 12 TRUE
5 d 32 34 60 TRUE
I finally chose to separate id and value in one data frame and id, low, and high in another data frame for this analysis.
However, here is a solution highly inspired by the solutions suggested for this new approach:
df <- data.frame(id = c("a", "a", "a", "d", "d"),
                 value = c(5, 46, 12, 14, 32),
                 low = c(46, 8, NA, 0, 34),
                 high = c(56, 20, NA, 12, 60))
temp <- merge(x = df[c("id", "value")],
              y = df[c("id", "low", "high")])
temp$result <- temp$value < temp$low | temp$value > temp$high
merge(x = df,
      y = aggregate(formula = result ~ id + value,
                    data = temp,
                    FUN = all))
id value low high result
1 a 12 NA NA FALSE
2 a 46 8 20 FALSE
3 a 5 46 56 TRUE
4 d 14 0 12 TRUE
5 d 32 34 60 TRUE
