R data.table conditional sum by row - r

> tempDT <- data.table(colA = c("E","E","A","C","E","C","E","C","E"), colB = c(20,30,40,30,30,40,30,20,10), group = c(1,1,1,1,2,2,2,2,2), want = c(NA, 30, 40, 70,NA,40,70,20,30))
> tempDT
colA colB group want
1: E 20 1 NA
2: E 30 1 30
3: A 40 1 40
4: C 30 1 70
5: E 30 2 NA
6: C 40 2 40
7: E 30 2 70
8: C 20 2 20
9: E 10 2 30
I have columns 'colA' 'colB' 'group': within each 'group', I would like to sum up 'colB' from bottom up until 'colA' is 'E'.

Based on the expected 'want', we create a run-length-id column 'grp' by checking if the value is 'E' in 'colA', then create 'want1' as the cumulative sum of 'colB' after grouping by 'grp' and 'group', get the row index ('i1') of elements that are duplicated in 'colA' and also is 'E' and assign the 'colB' values to 'want1'
tempDT[, grp:= rleid(colA=="E") * (colA != "E")
][grp!= 0, want1 := cumsum(colB), .(grp, group)]
i1 <- tempDT[, .I[colA=="E" & duplicated(colA)], group]$V1
tempDT[i1, want1 := colB][, grp := NULL][]
# colA colB group want want1
#1: E 20 1 NA NA
#2: E 30 1 30 30
#3: A 40 1 40 40
#4: C 30 1 70 70
#5: E 30 2 NA NA
#6: C 30 2 30 30

There's one approach: row reference + sums
# input data
tempDT <- data.table(colA = c("E","E","A","C","E","C","E","C","E"), colB = c(20,30,40,30,30,40,30,20,10), group = c(1,1,1,1,2,2,2,2,2), want = c(NA, 30, 40, 70,NA,40,70,20,30))
tempDT
# find row reference previous row where colA is "E"
lastEpos <- function(i) tail(which(tempDT$colA[1:(i-1)] == "E"), 1)
tempDT[, rowRef := sapply(.I, lastEpos), by = "group"]
# sum up
sumEpos <- function(i) {
valTEMP <- tempDT$rowRef[i]
outputTEMP <- sum(tempDT$colB[(valTEMP+1):i]) # sum
return(outputTEMP)
}
tempDT[, want1 := sapply(.I, sumEpos), by = "group"]
# deal with first row in every group
tempDT[, want1 := c(NA, want1[-1]), by = "group"]
# clean output
tempDT[, rowRef := NULL]
tempDT

library(dplyr)
df %>%
group_by(group) %>%
mutate(row_num = n():1) %>%
group_by(group) %>%
mutate(sum_colB = sum(colB[row_num < row_num[which(colA=='E')]]),
flag = ifelse(row_num >= row_num[which(colA=='E')], 0, 1),) %>%
mutate(sum_colB = ifelse(flag==1 & row_num==1, sum_colB, ifelse(flag==0, NA, colB))) %>%
select(-flag, -row_num) %>%
data.frame()
Output is:
colA colB group want sum_colB
1 E 20 1 NA NA
2 E 30 1 30 NA
3 A 40 1 40 40
4 C 30 1 70 70
5 E 30 2 NA NA
6 C 30 2 30 30
Sample data:
df <- structure(list(colA = structure(c(3L, 3L, 1L, 2L, 3L, 2L), .Label = c("A",
"C", "E"), class = "factor"), colB = c(20, 30, 40, 30, 30, 30
), group = c(1, 1, 1, 1, 2, 2), want = c(NA, 30, 40, 70, NA,
30)), .Names = c("colA", "colB", "group", "want"), row.names = c(NA,
-6L), class = "data.frame")

Related

How to quickly count number of unique entries after merging tables

library(data.table)
table1 <- data.table(id1 = c(1324, 2324, 29, 29, 1010, 1010),
type = c(1, 1, 2, 1, 1, 1),
class = c("A", "A", "B", "D", "D", "A"),
number = c(1, 98, 100, 100, 70, 70))
table2 <- data.table(id2 = c(1998, 1998, 2000, 2000, 2000, 2010, 2012, 2012),
type = c(1, 1, 3, 1, 1, 5, 1, 1),
class = c("D", "A", "D", "D", "A", "B", "A", "A"),
min_number = c(34, 0, 20, 45, 5, 23, 1, 1),
max_number = c(50, 100, 100, 100, 100, 9, 10, 100))
> table1
id1 type class number
1: 1324 1 A 1
2: 2324 1 A 98
3: 29 2 B 100
4: 29 1 D 100
5: 1010 1 D 70
6: 1010 1 A 70
> table2
id2 type class min_number max_number
1: 1998 1 D 34 50
2: 1998 1 A 0 100
3: 2000 3 D 20 100
4: 2000 1 D 45 100
5: 2000 1 A 5 100
6: 2010 5 B 23 9
7: 2012 1 A 1 10
8: 2012 1 A 1 100
I have two tables, and I would like to merge them based on type, class, and whether number lies between min_number and max_number. Then I would like to create a new variable nMatch that stores the number of unique id2s that match with each id1.
setindexv(table2, c("type", "class"))
for (t1_row in seq_len(nrow(table1))) {
print(t1_row)
set(
table1, t1_row, "matches",
table2[table1[t1_row], on = c("type", "class", "max_number >= number", "min_number <= number"), .(list(id2))]
)
}
> table1[, .(nMatch = uniqueN(unlist(matches), na.rm = TRUE)), by = .(id1)]
id1 nMatch
1: 1324 2
2: 2324 3
3: 29 1
4: 1010 3
The approach above is row-by-row as suggested here, but my real dataset has millions of rows. What's another way of doing this that's faster?
You can try data.table with on = .(...) to merge two data tables
table1[
table2,
.(id1, id2),
on = .(type, class, number >= min_number, number <= max_number),
nomatch = NULL
][
,
.(nMatch = uniqueN(id2)),
id1
]
and will get
id1 nMatch
1: 1324 2
2: 1010 3
3: 2324 3
4: 29 1
An option with tidyverse
library(dplyr)
library(tidyr)
left_join(table1, table2, by =
join_by(type, class, number >= min_number, number <= max_number)) %>%
distinct(id1, id2) %>%
drop_na %>%
count(id1, name = "nMatch")
-output
id1 nMatch
<num> <int>
1: 29 1
2: 1010 3
3: 1324 2
4: 2324 3

Aggregating / rolling up specific records "into" subsequent records

I am trying to aggregate records with a specific type into subsequent records.
I have a dataset similar to the following:
df_initial <- data.frame("Id" = c(1, 2, 3, 4, 5),
"Qty" = c(105, 110, 100, 115, 120),
"Type" = c("A", "B", "B", "A", "A"),
"Difference" = c(30, 34, 32, 30, 34))
After sorting on the Id field, I'd like to aggregate records of Type = "B" into the next record of type = "A".
In other words, I'm looking to create df_new, which adds the Qty and Difference values for Ids 2 and 3 into the Qty and Difference values for Id 4, and flags Id 4 as being adjusted (in the field AdjustedFlag).
df_new <- data.frame("Id" = c(1, 4, 5),
"Qty" = c(105, 325, 120),
"Type" = c("A", "A", "A"),
"Difference" = c(30, 96, 34),
"AdjustedFlag" = c(0, 1, 0))
I'd greatly appreciate any advice or ideas about how to do this in R, preferably using data.table.
A data.table solution:
df_initial[, .(
Id = Id[.N], Qty = sum(Qty),
Difference = sum(Difference),
AdjustedFlag = +(.N > 1)
), by = .(grp = rev(cumsum(rev(Type == "A"))))
][, grp := NULL][]
# Id Qty Difference AdjustedFlag
# <num> <num> <num> <int>
# 1: 1 105 30 0
# 2: 4 325 96 1
# 3: 5 120 34 0
This can be solved by creating a new grouping variable, that groups the rows into the groups you describe, with the idea being to utilize that grouping variable for the desired aggregation.
Instead of having
A B B A A
that new grouping variable should look something like this:
1 2 2 2 3
This is not a data.table solution, but the same logic could be applied there:
library(tidyverse)
df_initial |>
mutate(
type2 = ifelse(Type == "A", as.numeric(factor(Type)), 0),
type2 = cumsum(type2),
type2 = ifelse(Type == "B", NA, type2)
) |>
fill(type2, .direction = "up") |>
group_by(type2) |>
summarise(
id = max(Id),
Qty = sum(Qty),
Difference = sum(Difference),
AdjustedFlag = as.numeric(n() > 1)
)
#> # A tibble: 3 × 5
#> type2 id Qty Difference AdjustedFlag
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 105 30 0
#> 2 2 4 325 96 1
#> 3 3 5 120 34 0
Using tidyverse
df_initial %>%
mutate(gn = if_else(lag(Type, default = 'A') == 'B' | Type == 'B', 'B', Type),
gr = cumsum(lag(gn, default = 'A') != gn),
adjusted = if_else(lag(Type, default = 'A') == 'B' | Type == 'B', 1, 0)) %>%
group_by(gr) %>%
summarise(Id = last(Id),
Qty = sum(Qty),
Type = 'A',
Difference = sum(Difference),
Adjusted_flg = max(adjusted)) %>% ungroup()
Here we create an interim dataset that looks like:
Id Qty Type Difference gn gr Adjusted
1 1 105 A 30 A 0 0
2 2 110 B 34 B 1 0
3 3 100 B 32 B 1 0
4 4 115 A 30 B 1 1
5 5 120 A 34 A 2 0
And use this to create our final table within the summarise. The gr is a column for indicating a group of values, which is why we group_by it.

How to check previous row value with present row value in dataframe

How to check previous row value with present row value dynamically for all column of data frame by grouping specific ID.
my data frame:
ID ITEM1 ITEM2 ITEM3
1 A A A
2 C B C
1 A B C
1 B A C
2 NA B F
3 A A D
4 R G J
4 H T J
For Ex:
ID ITEM1 ITEM2 ITEM3 ITEM1change ITEM2change ITEM3change
1 A A A 0 0 0
1 A B C 0 1 1
1 B A C 1 1 0
2 C B C 0 0 0
2 NA B F 1 0 1
3 A A D 0 0 0
4 R G J 0 0 0
4 H T J 1 1 0
My final output will be:
Fiels modifiedcout unmodifiedcount Total
ITEM1change 3 5 8
ITEM2change 3 5 8
ITEM3change 2 6 8
my data:
structure(list(ID = c(1, 2, 1, 1, 2, 3, 4, 4), ITEM1 = structure(c(1L,
3L, 1L, 2L, NA, 1L, 5L, 4L), .Label = c("A", "B", "C", "H", "R"
), class = "factor"), ITEM2 = structure(c(1L, 2L, 2L, 1L, 2L,
1L, 3L, 4L), .Label = c("A", "B", "G", "T"), class = "factor"),
ITEM3 = structure(c(1L, 2L, 2L, 2L, 4L, 3L, 5L, 5L), .Label = c("A",
"C", "D", "F", "J"), class = "factor")), .Names = c("ID",
"ITEM1", "ITEM2", "ITEM3"), row.names = c(NA, -8L), class = "data.frame")
A possible solution:
library(dplyr)
library(tidyr)
df %>%
gather(item, value, -1) %>%
group_by(ID, item) %>%
mutate(change = lag(value, default = first(value)) != value,
change = replace(change, is.na(change), TRUE)) %>%
group_by(item) %>%
summarise(modified = sum(change, na.rm = TRUE),
unmodified = sum(!change, na.rm = TRUE)) %>%
mutate(total = modified + unmodified)
which gives:
# A tibble: 3 x 4
item modified unmodified total
<chr> <int> <int> <int>
1 ITEM1 3 5 8
2 ITEM2 3 5 8
3 ITEM3 2 6 8
Here is another idea using rollapply from zoo. By using rollapply with width = 2, we are testing if x is not equal with x-1. Wrapping it in as.integer gives 1s (TRUE) and 0s (FALSE). We then replace all NAs with 1 since you consider them as being modified, and use colSums to sum the modified/unmodified elements. The total is just the number of rows of the original data frame.
library(zoo)
m1 <- do.call(rbind, lapply(split(df, df$ID), function(i)
sapply(i[-1], function(j)
as.integer(c(FALSE, rollapply(j, 2, function(k) k[1] != k[2]))))))
m1 <- replace(m1, is.na(m1), 1)
#giving
# ITEM1 ITEM2 ITEM3
# 0 0 0
# 0 1 1
# 1 1 0
# 0 0 0
# 1 0 1
#3 0 0 0
# 0 0 0
# 1 1 0
To get your expected data frame,
final_df <- data.frame(modified = colSums(m1 == 1),
unmodified = colSums(m1 != 1),
Total = nrow(df), stringsAsFactors = FALSE)
which gives,
modified unmodified Total
ITEM1 3 5 8
ITEM2 3 5 8
ITEM3 2 6 8
The given data has many columns of the same type. This strongly suggests that the data better be stored in long format rather than in wide format.
Jaap's solution is reshaping the data using tidyr / dplyr.
However, I would like to suggest a data.tablesolution which does not reshape the data. In addition, it avoids to handle NA values separately.
library(data.table)
# coerce to data.table, loop over columns and determine changes to previous row by ID
tmp <- setDT(DF)[, lapply(.SD, function(x) x == shift(x, fill = x[1])), by = ID]
tmp
ID ITEM1 ITEM2 ITEM3
1: 1 TRUE TRUE TRUE
2: 1 TRUE FALSE FALSE
3: 1 FALSE FALSE TRUE
4: 2 TRUE TRUE TRUE
5: 2 NA TRUE FALSE
6: 3 TRUE TRUE TRUE
7: 4 TRUE TRUE TRUE
8: 4 FALSE FALSE TRUE
Now, we can count the unchanged rows:
tmp[, lapply(.SD, sum, na.rm = TRUE), .SDcols = -"ID"]
ITEM1 ITEM2 ITEM3
1: 5 5 6
From here, OP's expected result can be achieved in two different ways
using melt()
melt(tmp[, lapply(.SD, sum, na.rm = TRUE), .SDcols = -"ID"]
, measure.vars = patterns("^ITEM"),
variable.name = "item",
value.name = "unmodified")[
, c("modified", "Total") := .(nrow(DF) - unmodified, nrow(DF))][]
or by transposing:
as.data.table(
t(tmp[, lapply(.SD, sum, na.rm = TRUE), .SDcols = -"ID"])
, keep.rownames = "item")[, setnames(.SD, "V1", "unmodified")][
, c("modified", "Total") := .(nrow(DF) - unmodified, nrow(DF))][]
Both return the same result:
item unmodified modified Total
1: ITEM1 5 3 8
2: ITEM2 5 3 8
3: ITEM3 6 2 8
For the sake of completeness, here is also a data.table implementation of the reshape approach. As above, NA are handled by counting the unmodified rows first excluding any NA.
melt(setDT(DF), id.vars = "ID", variable.name = "item")[
, value == shift(value, fill = value[1L]), by = .(ID, item)][
, .(unmodified = sum(V1, na.rm = TRUE)), by = item][
, c("modified", "Total") := .(nrow(DF) - unmodified, nrow(DF))][]
If dat is your data, then try:
Create ITEMCHANGE variables
dat["ITEM1Change"] <- c(NA, head(dat["ITEM1"], dim(dat)[1] - 1)[[1]])
dat["ITEM2Change"] <- c(NA, head(dat["ITEM2"], dim(dat)[1] - 1)[[1]])
dat["ITEM3Change"] <- c(NA, head(dat["ITEM3"], dim(dat)[1] - 1)[[1]])
Then compare if there are changes
dat$ITEM1Change <- ifelse(dat$ITEM1Change == dat$ITEM1, 0, 1)
dat$ITEM2Change <- ifelse(dat$ITEM2Change == dat$ITEM2, 0, 1)
dat$ITEM3Change <- ifelse(dat$ITEM3Change == dat$ITEM2, 0, 1)
Then group and summarize
library(dplyr)
dat %>%
group_by("ITEM1") %>%
summarise_at(.funs = sum, .vars = "ITEM1Change") -> ITEM1Change
etc.
Is this what you need?

Round conditionally numbers in R

How can I round conditionally the values of a column in a dataframe in R? I need to round to the lower 10 from 0-89 and not from 90-100. For example:
ID value
A 15
B 47
C 91
D 92
has to be changed to
ID value
A 10
B 40
C 91
D 92
so, no changes for C/D and A/B rounded down
Any ideas?
Thanks
You can do it like this:
df$value[df$value < 90] <- floor(df$value[df$value < 90] / 10) * 10
# ID value
# 1 A 10
# 2 B 40
# 3 C 91
# 4 D 92
As a reminder, here is your data:
df <- structure(list(ID = c("A", "B", "C", "D"), value = c(15L, 47L,
91L, 92L)), .Names = c("ID", "value"), class = "data.frame", row.names = c(NA,
-4L))
Other solution using data.table:
library(data.table)
setDT(df)[, value:= as.numeric(value)][value<90, value:= floor(value/10) * 10]
# ID value
# 1: A 10
# 2: B 40
# 3: C 91
# 4: D 92
You could do:
df$value <- with(df, ifelse(value %in% c(0:89), value-(value%%10), value))

How to use R to get all pairs from two column with index

I would like to use R to get all pairs from two column with index. It may need some loop to finish this function. For example, turn two columns with the gene name and index:
a 1,
b 1,
c 1,
d 2,
e 2
into a new matrix
a b 1,
b c 1,
a c 1,
d e 2
Can anyone help?
A tidyverse option using combn on a grouped data.frame:
library(tidyverse)
df %>% group_by(index) %>%
summarise(gene = list(as_data_frame(t(combn(gene, 2))))) %>%
unnest(.sep = '_')
## # A tibble: 4 × 3
## index gene_V1 gene_V2
## <int> <chr> <chr>
## 1 1 a b
## 2 1 a c
## 3 1 b c
## 4 2 d e
The same logic can be replicated in base R:
df2 <- aggregate(gene ~ index, df, function(x){t(combn(x, 2))})
do.call(rbind, apply(df2, 1, data.frame))
## index gene.1 gene.2
## 1 1 a b
## 2 1 a c
## 3 1 b c
## 4 2 d e
Data
df <- structure(list(gene = c("a", "b", "c", "d", "e"), index = c(1L,
1L, 1L, 2L, 2L)), .Names = c("gene", "index"), row.names = c(NA,
-5L), class = "data.frame")
Here is an option using data.table. Convert the 'data.frame' to 'data.table' (setDT(df)), grouped by 'index', we get the combn of 'gene', transpose it and set the names of the 2nd and 3rd column (if needed).
library(data.table)
setnames(setDT(df)[, transpose(combn(gene, 2, FUN = list)),
by = index], 2:3, paste0("gene", 1:2))[]
# index gene1 gene2
#1: 1 a b
#2: 1 a c
#3: 1 b c
#4: 2 d e

Resources