add rows to data.frame that are calculated from existing rows - r

Given
x = data.frame(value = 1:3, col2 = c('min', 'min', 'max'), id = c(1, 2, 1))
Is there any way to group by id, average the min and max values, and store the result in value?
value col2 id
1 1 min 1
2 2 min 2
3 3 max 1
4 2 avg 1 ## row I'd like to add, where "avg" identifies the average of min and max

If you only have those three columns, and each group has only two values (min and max), then
library(dplyr)
x %>%
group_by(id) %>%
summarise(value = mean(value), col2 = 'Average') %>%
bind_rows(x) %>%
arrange(id)
which gives,
# A tibble: 5 x 3
id value col2
<dbl> <dbl> <chr>
1 1 2 Average
2 1 1 min
3 1 3 max
4 2 2 Average
5 2 2 min

I think you are looking for this:
library(dplyr)
x = data.frame(value = 1:3, col2 = c('min', 'min', 'max'), id = c(1, 2, 1))
x %>% group_by(id) %>% summarize(avg = mean(value))
This will give you the values you want, but won't append them to your dataframe.
With so little context, I'm not sure you should do so, but you can bind the result back onto x if you really want to.
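For instance, a minimal sketch of appending those per-id averages back onto x with bind_rows, labelled 'avg' as in the desired output:
library(dplyr)
x %>%
  group_by(id) %>%
  summarize(value = mean(value), col2 = 'avg', .groups = "drop") %>%
  bind_rows(x, .) %>%  # original rows first, then the computed 'avg' rows
  arrange(id)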

Related

how to select rows from lists in r?

df1 = data.frame(
y1 = c(1, -2, 3),
y2 = c(4, 5, 6), out = c("*", "*", "")
)
Create another data frame
df2 = data.frame(
y1 = c(7, -3, 9),
y2 = c(1, 4, 6), out = c("", "*", "")
)
Create a list of data frames using list()
lis = list(df1, df2)
I want to compare row by row (first row from index [[1]] with first row from index [[2]], and so on):
if both rows have * or both are empty under out (i.e. they match), select the row with the highest absolute value in y1 (and put the list index under ind)
otherwise, take the row with * under out (and put the list index under ind)
example of desired output:
y1 y2 out ind
1 4 * 1
-3 4 * 2
9 6 "" 2
An option is to bind the datasets with bind_rows, specifying .id to create an identifier for the list element. Then, grouped by the rowid of the 'ind' column, arrange the rows within each group so that rows with * in out come first, ordered by descending absolute value of 'y1'. Finally, slice the row with the maximum absolute 'y1' if the number of distinct elements in 'out' is 1, or else return the first row (which will have * in out if there is one).
library(dplyr)
library(data.table)
bind_rows(lis, .id = 'ind') %>%
group_by(rn = rowid(ind)) %>%
arrange(out != "*", desc(abs(y1)), .by_group = TRUE) %>%
slice(if(n_distinct(out) == 1) which.max(abs(y1)) else 1) %>%
ungroup %>%
select(y1, y2, out, ind)
-output
# A tibble: 3 × 4
y1 y2 out ind
<dbl> <dbl> <chr> <chr>
1 1 4 "*" 1
2 -3 4 "*" 2
3 9 6 "" 2
Or, without arrange: after grouping, create a position index ('i1') inside slice for the rows where out is *. If all out values in the group are the same, take the row with the maximum absolute 'y1'; otherwise subset 'y1' by 'i1', find the position of its absolute maximum, and index back into 'i1'.
bind_rows(lis, .id = 'ind') %>%
group_by(rn = rowid(ind)) %>%
slice({i1 <- which(out == "*")
if(n_distinct(out) == 1) which.max(abs(y1)) else
i1[which.max(abs(y1[i1]))]}) %>%
ungroup %>%
select(y1, y2, out, ind)
-output
# A tibble: 3 × 4
y1 y2 out ind
<dbl> <dbl> <chr> <chr>
1 1 4 "*" 1
2 -3 4 "*" 2
3 9 6 "" 2
Or use a similar ordering approach with data.table
library(data.table)
dt1 <- rbindlist(lis, idcol = "ind")[, rn := rowid(ind)]
dt2 <- dt1[order(rn, out != "*", desc(abs(y1)))]
dt2[dt2[, .I[if(uniqueN(out) == 1) which.max(abs(y1)) else
1L], .(rn)]$V1][, rn := NULL][]
ind y1 y2 out
1: 1 1 4 *
2: 2 -3 4 *
3: 2 9 6

Sum column based on the strings in the other two columns, strings can be in either column [duplicate]

This question already has an answer here:
R sum observations by unique column PAIRS (B-A and A-B) and NOT unique combinations (B-A or A-B)
Inputs:
group1 <- c("A", "B", "C", "D")
group2 <- c("B", "A", "D", "C")
count <- c(1, 3, 2, 4)
df <- data.frame(group1, group2, count)
df:
group1 group2 count
1 A B 1
2 B A 3
3 C D 2
4 D C 4
Desired output:
group total
1 AB or BA 4
2 CD or DC 6
My actual dataset has a very long list of these group pairs.
Sort the strings so the alphabetically first one is in a specific column:
library(dplyr)
df %>%
mutate(g1 = pmin(group1, group2),
g2 = pmax(group1, group2)) %>%
group_by(g1, g2) %>%
summarize(total = sum(count), .groups = "drop")
# # A tibble: 2 × 3
# g1 g2 total
# <chr> <chr> <dbl>
# 1 A B 4
# 2 C D 6
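If the combined "AB or BA" label from the desired output is needed, a hedged follow-up to the same pipeline could build it with paste0 (a sketch):
df %>%
  mutate(g1 = pmin(group1, group2),
         g2 = pmax(group1, group2)) %>%
  group_by(g1, g2) %>%
  summarize(total = sum(count), .groups = "drop") %>%
  mutate(group = paste0(g1, g2, " or ", g2, g1)) %>%  # e.g. "AB or BA"
  select(group, total)
# # A tibble: 2 × 2
# #   group    total
# #   <chr>    <dbl>
# # 1 AB or BA     4
# # 2 CD or DC     6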
This works if the pairs are always in adjacent rows:
library(tidyverse)
df %>%
mutate(
# create row ID for each pair:
id = (row_number() - 1) %/% 2,
# create column `group`:
group = str_c(group1, group2, " or ", group2, group1)) %>%
group_by(id) %>%
summarise(across(group, first),
total = sum(count)) %>%
select(-id)
# A tibble: 2 × 2
group total
<chr> <dbl>
1 AB or BA 4
2 CD or DC 6

Append first row by group to original data

I have data with a grouping variable "ID" and some values:
ID, Value
1, 1
1, 2
1, 3
1, 4
2, 5
2, 6
2, 7
2, 8
Within each group, I want to append the first row after the last row.
Desired result:
ID, Value
1, 1
1, 2
1, 3
1, 4
1, 1 # First row of ID 1 inserted as last row in the group
2, 5
2, 6
2, 7
2, 8
2, 5 # First row of ID 2 inserted as last row in the group
I have 5000 rows of this.
Within tidyverse, you could use add_row with do (now deprecated) or group_modify (experimental):
dat |>
group_by(ID) |>
do(add_row(., ID = unique(.$ID), Value = first(.$Value))) |>
ungroup()
dat |>
group_by(ID) |>
group_modify(~ add_row(., Value = first(.$Value))) |>
ungroup()
Or bind_rows with summarize (my variation of #Gregor Thomas, thanks):
dat |>
group_by(ID) |>
summarize(bind_rows(cur_data(), head(cur_data(), 1))) |>
ungroup()
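Note that cur_data() is deprecated in recent dplyr releases; a hedged equivalent (assuming dplyr >= 1.1.0) uses reframe with pick:
dat |>
  group_by(ID) |>
  # pick(everything()) returns the current group's non-grouping columns
  reframe(bind_rows(pick(everything()), head(pick(everything()), 1)))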
Or, applying the same logic as #Henrik using bind_rows, filter, and arrange:
dat |>
bind_rows(dat |> filter(!duplicated(ID))) |>
arrange(ID)
Output:
# A tibble: 10 × 2
ID Value
<int> <dbl>
1 1 1
2 1 2
3 1 3
4 1 4
5 1 1
6 2 5
7 2 6
8 2 7
9 2 8
10 2 5
Thanks to #SamR for the data.
And with data.table:
library(data.table)
dt <- data.table(ID = rep(1:2, each = 4), Value = 1:8)
dt[,.(Value = c(Value, first(Value))), ID]
#> ID Value
#> 1: 1 1
#> 2: 1 2
#> 3: 1 3
#> 4: 1 4
#> 5: 1 1
#> 6: 2 5
#> 7: 2 6
#> 8: 2 7
#> 9: 2 8
#> 10: 2 5
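If the real data had more value columns, a hedged variation of the same idea keeps all of them by stacking .SD with its own first row per group:
# append each group's first row, keeping every non-grouping column
dt[, rbind(.SD, .SD[1]), by = ID]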
Benchmarking with a 5000-row table:
library(dplyr)
dt <- data.table(ID = rep(1:1250, each = 4), Value = 1:5e3)
f1 <- function(dt) dt[,.(Value = c(Value, first(Value))), ID]
# base R
f2 <- function(dt) do.call(rbind, lapply(split(dt, by = "ID"), function(x) rbind(x, x[1,])))
# tidyverse
f3 <- function(dt) {
dt %>%
group_by(ID) %>%
do(add_row(., ID = unique(.$ID), Value = first(.$Value))) %>%
ungroup()
}
f4 <- function(dt) {
dt %>%
group_by(ID) %>%
group_modify(~ add_row(., Value = first(.$Value))) %>%
ungroup()
}
microbenchmark::microbenchmark(data.table = f1(dt),
"base R" = f2(dt),
tidyverse1 = f3(dt),
tidyverse2 = f4(dt),
times = 10)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> data.table 3.4989 3.6844 4.16619 4.3085 4.4623 5.3020 10
#> base R 245.1397 263.1636 283.61131 284.8053 307.7105 310.3901 10
#> tidyverse1 761.7097 773.3705 791.05115 787.9463 808.5416 821.5321 10
#> tidyverse2 711.9593 716.4959 752.20273 728.2170 782.6474 837.1926 10
If speed is really important, this simple Rcpp function provides a very fast solution:
Rcpp::cppFunction(
"IntegerVector firstLast(const IntegerVector& x) {
const int n = x.size();
IntegerVector idxOut(2*n);
int i0 = 1;
int idx = 0;
idxOut(0) = 1;
for (int i = 1; i < n; i++) {
if (x(i) != x(i - 1)) {
idxOut(++idx) = i0;
i0 = i + 1;
}
idxOut(++idx) = i + 1;
}
idxOut(++idx) = i0;
return idxOut[Rcpp::Range(0, idx)];
}"
)
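As a quick sanity check on the small table from above (a sketch): firstLast() returns the row positions 1..n with each group's first position repeated at the end of its group, so subsetting reproduces the appended result.
dt <- data.table(ID = rep(1:2, each = 4), Value = 1:8)
firstLast(dt$ID)
#> [1] 1 2 3 4 1 5 6 7 8 5
dt[firstLast(dt$ID)]   # same output as the grouped data.table answer above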
Benchmarking against the fastest solution from this answer (on a much larger dataset):
dt = data.table(ID = rep(1:125e4, each = 4), Value = 1:5e6)
microbenchmark::microbenchmark(
f_uniq_dt = setorder(rbindlist(list(dt, unique(dt, by = "ID"))), ID),
f_Rcpp = dt[firstLast(dt$ID)],
check = "equal"
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f_uniq_dt 78.6056 83.71345 95.42876 85.80720 90.03685 175.8867 100
#> f_Rcpp 49.1485 53.38275 60.96322 55.44925 58.01485 121.3637 100
Use !duplicated to get first row by "ID" (more efficient than "by group" operations). rbind with original data and order result:
df = data.frame(ID = rep(1:2, each = 4), Value = 1:8)
d2 = rbind(df, df[!duplicated(df$ID), ])
d2[order(d2$ID), ]
# ID Value
# 1 1 1
# 2 1 2
# 3 1 3
# 4 1 4
# 11 1 1
# 5 2 5
# 6 2 6
# 7 2 7
# 8 2 8
# 51 2 5
Same idea with data.table::duplicated:
d = as.data.table(df)
d2 = rbindlist(list(d, d[!duplicated(d, by = "ID")]))
setorder(d2, ID)
More straightforward with data.table::unique:
d2 = rbindlist(list(d, unique(d, by = "ID")))
setorder(d2, ID)
Also data.table::rowid:
d2 = rbindlist(list(d, d[rowid(ID) == 1]))
setorder(d2, ID)
By avoiding "by group" operations, the !duplicated, unique and rowid alternatives are all faster than the data.table by solution, which was the clear winner in the previous benchmark, on a 5000 row data set (see OP):
df = data.frame(ID = rep(1:1250, each = 4), Value = 1:5e3)
d = as.data.table(df)
library(microbenchmark)
microbenchmark(
f_by = {
d1 = d[ , .(Value = c(Value, first(Value))), by = ID]
},
f_dupl_df = {
d2 = rbind(df, df[!duplicated(df$ID), ])
d2 = d2[order(d2$ID), ]
},
f_dupl_dt = {
d3 = rbindlist(list(d, d[!duplicated(d, by = "ID")]))
setorder(d3, ID)
},
f_uniq_dt = {
d4 = rbindlist(list(d, unique(d, by = "ID")))
setorder(d4, ID)
},
f_rowid = {
d5 = rbindlist(list(d, d[rowid(ID) == 1]))
setorder(d5, ID)
},
times = 10L)
# Unit: milliseconds
# expr min lq mean median uq max
# f_by 8.5167 9.1397 11.01410 9.90925 12.3327 15.9134
# f_dupl_df 6.8337 7.0901 8.31057 7.56810 8.4899 13.9278
# f_dupl_dt 2.4742 2.6687 3.24932 3.18670 3.7993 4.3318
# f_uniq_dt 2.2059 2.4225 3.50756 3.36250 4.4590 5.6632
# f_rowid 2.2963 2.4295 3.43876 2.74345 4.8035 5.9278
all.equal(d1, as.data.table(d2))
all.equal(d1, d3)
all.equal(d1, d4)
all.equal(d1, d5)
# [1] TRUE
However, a sub-second benchmark isn't very informative, so try on larger data with many groups. The base data.frame solution loses ground, while data.table::duplicated, unique and rowid scale better and are now about 20 times faster, with data.table::unique the fastest.
df = data.frame(ID = rep(1:1250000, each = 4), Value = 1:5e6)
d = as.data.table(df)
# Unit: milliseconds
# expr min lq mean median uq max neval
# f_by 6834.5959 7157.1686 12273.2399 7775.3919 8850.5324 35339.0262 10
# f_dupl_df 10732.1536 11035.4886 19440.4964 11691.5347 37956.6961 38387.4927 10
# f_dupl_dt 174.5640 183.8399 391.8605 381.8920 401.4929 962.4948 10
# f_uniq_dt 156.1267 161.9555 212.3472 180.7912 209.3905 406.7780 10
# f_rowid 192.1106 197.1564 380.0023 234.5851 474.5024 1172.6529 10
For completeness, a binary search, with mult = "first" to select the first match:
d[.(unique(ID)), on = .(ID), mult = "first"]
However, in both timings above it took about twice as long as the unique alternative.
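To actually append those first rows, the join would slot into the same pattern as the other data.table variants (a sketch):
# select each group's first row via the keyed join, append, and reorder
d2 = rbindlist(list(d, d[.(unique(ID)), on = .(ID), mult = "first"]))
setorder(d2, ID)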
Here is a base R method:
do.call(rbind,
lapply(split(dat, dat$ID), \(id_df) rbind(id_df, id_df[1,]))
)
# ID Value
# 1.1 1 1
# 1.2 1 2
# 1.3 1 3
# 1.4 1 4
# 1.5 1 1
# 2.5 2 5
# 2.6 2 6
# 2.7 2 7
# 2.8 2 8
# 2.51 2 5
It does give you slightly strange row names - if you care about that you can wrap it in (or pipe it to) tibble::as_tibble(), which removes row names altogether.
Alternatively you could do data.table::rbindlist(lapply(split(dat, dat$ID), \(id_df) rbind(id_df, id_df[1,]))), because data.table also does not use row names.
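For example, the as_tibble wrapping mentioned above could look like this (a sketch, assuming the tibble package is installed):
tibble::as_tibble(
  do.call(rbind,
          lapply(split(dat, dat$ID), \(id_df) rbind(id_df, id_df[1, ])))
)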
Data
dat <- read.csv(text = "ID, Value
1, 1
1, 2
1, 3
1, 4
2, 5
2, 6
2, 7
2, 8", h=T)
You can try the following data.table option
setDT(df)[, .SD[(seq(1 + .N) - 1) %% .N + 1], ID]
ID Value
1: 1 1
2: 1 2
3: 1 3
4: 1 4
5: 1 1
6: 2 5
7: 2 6
8: 2 7
9: 2 8
10: 2 5
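The index expression may look cryptic; spelled out for a single group with .N = 4 rows, it simply produces the positions 1..N followed by 1 again:
(seq(1 + 4) - 1) %% 4 + 1
#> [1] 1 2 3 4 1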

Row Minimum except certain columns

I have a data frame below. I need to find the row min and max, excluding the few columns that are characters.
df
x y z
1 1 1 a
2 2 5 b
3 7 4 c
I need
df
x y z Min Max
1 1 1 a 1 1
2 2 5 b 2 5
3 7 4 c 4 7
Another dplyr possibility could be:
df %>%
mutate(Max = do.call(pmax, select_if(., is.numeric)),
Min = do.call(pmin, select_if(., is.numeric)))
x y z Max Min
1 1 1 a 1 1
2 2 5 b 5 2
3 7 4 c 7 4
Or a variation proposed by #G. Grothendieck:
df %>%
mutate(Min = pmin(!!!select_if(., is.numeric)),
Max = pmax(!!!select_if(., is.numeric)))
Another base R solution: subset only the numeric columns, then use apply on each row to get the minimum and maximum values with range.
cbind(df, t(apply(df[sapply(df, is.numeric)], 1, function(x)
setNames(range(x, na.rm = TRUE), c("min", "max")))))
# x y z min max
#1 1 1 a 1 1
#2 2 5 b 2 5
#3 7 4 c 4 7
1) This one-liner uses no packages:
transform(df, min = pmin(x, y), max = pmax(x, y))
giving:
x y z min max
1 1 1 a 1 1
2 2 5 b 2 5
3 7 4 c 4 7
2) If you have many columns and don't want to list them all, or to determine yourself which are numeric, then this also uses no packages.
ix <- sapply(df, is.numeric)
transform(df, min = apply(df[ix], 1, min), max = apply(df[ix], 1, max))
If your actual data has NAs and if you want to ignore them when taking the min or max then min, max, pmin and pmax all take an optional na.rm = TRUE argument.
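For instance, a hedged variant of (1) that ignores NAs when computing the row extremes:
transform(df, min = pmin(x, y, na.rm = TRUE), max = pmax(x, y, na.rm = TRUE))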
Note
Lines <- "x y z
1 1 1 a
2 2 5 b
3 7 4 c"
df <- read.table(text = Lines)
1) We can use select_if to select the columns that are numeric, then get the rowwise min and max with pmin and pmax, and bind the result to the original dataset
library(dplyr)
library(purrr)
df %>%
select_if(is.numeric) %>%
transmute(Min = reduce(., pmin, na.rm = TRUE),
Max = reduce(., pmax, na.rm = TRUE)) %>%
bind_cols(df, .)
# x y z Min Max
#1 1 1 a 1 1
#2 2 5 b 2 5
#3 7 4 c 4 7
NOTE: Here, select_if is called only once
2) The same can be done in base R (no packages used)
i1 <- names(which(sapply(df, is.numeric)))
df['Min'] <- do.call(pmin, c(df[i1], na.rm = TRUE))
df['Max'] <- do.call(pmax, c(df[i1], na.rm = TRUE))
Also, as stated in the comments, this is a generalized option. If there are only two columns, simply doing pmin(x, y) or pmax(x, y) works, but that doesn't check whether the columns are numeric and is not a general solution.
NOTE: All of the solutions mentioned here were either answered first or come from the comments with the OP
data
df <- structure(list(x = c(1L, 2L, 7L), y = c(1L, 5L, 4L), z = c("a",
"b", "c")), class = "data.frame", row.names = c("1", "2", "3"
))

Removing groups from dataframe if variable has repeated values

I would like to ask if there is a way of removing a group from a data frame using dplyr (or any other way, for that matter). Let's say I have a data frame in the following form, grouped by Variable 1:
Variable 1 Variable 2
1 a
1 b
2 a
2 a
2 b
3 a
3 c
3 a
... ...
I would like to remove only the groups that have two consecutive identical values in Variable 2. That is, in the table above it would remove group 2, because its values are a, a, b, but not group 3, which has a, c, a. So I would get the table below:
Variable 1 Variable 2
1 a
1 b
3 a
3 c
3 a
... ...
To test for consecutive identical values, you can compare a value to the previous value in that column. In dplyr, this is possible with lag. (You could do the same thing by comparing to the next value with lead; the result comes out the same - see the sketch after the output below.)
Group the data by variable1, get the lag of variable2, then add up how many of these duplicates there are in that group. Then filter for just the groups with no duplicates. After that, feel free to remove the dupesInGroup column.
library(tidyverse)
df %>%
group_by(variable1) %>%
mutate(dupesInGroup = sum(variable2 == lag(variable2), na.rm = T)) %>%
filter(dupesInGroup == 0)
#> # A tibble: 5 x 3
#> # Groups: variable1 [2]
#> variable1 variable2 dupesInGroup
#> <int> <chr> <int>
#> 1 1 a 0
#> 2 1 b 0
#> 3 3 a 0
#> 4 3 c 0
#> 5 3 a 0
Created on 2018-05-10 by the reprex package (v0.2.0).
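The lead variant mentioned above, as a sketch against the same (not shown) df with columns variable1 and variable2:
df %>%
  group_by(variable1) %>%
  # compare each value to the next one in the group instead of the previous
  mutate(dupesInGroup = sum(variable2 == lead(variable2), na.rm = T)) %>%
  filter(dupesInGroup == 0)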
prepare data frame:
df <- data.frame("Variable 1" = c(1, 1, 2, 2, 2, 3, 3, 3), "Variable 2" = unlist(strsplit("abaabaca", "")))
write functions to test if consecutive repetitions are there or not:
any.consecutive.p <- function(v) {
for (i in 1:(length(v) - 1)) {
if (v[i] == v[i + 1]) {
return(TRUE)
}
}
return(FALSE)
}
any.consecutive.in.col.p <- function(df, col) {
any.consecutive.p(df[, col])
}
any.consecutive.p() returns TRUE as soon as it finds a consecutive repetition in a vector (v).
any.consecutive.in.col.p() looks for consecutive repetitions in a column of a data frame.
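A quick check of the helpers on the example data (a sketch):
any.consecutive.p(c("a", "a", "b"))        # TRUE: positions 1 and 2 repeat
any.consecutive.in.col.p(df, "Variable.2") # TRUE: the full column contains a, a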
split data frame by values of Variable.1
df.l <- split(df, df$Variable.1)
df.l
$`1`
Variable.1 Variable.2
1 1 a
2 1 b
$`2`
Variable.1 Variable.2
3 2 a
4 2 a
5 2 b
$`3`
Variable.1 Variable.2
6 3 a
7 3 c
8 3 a
Finally, go over this list of data frames and test each one for consecutive duplicates in the Variable.2 column.
If found, don't collect it.
Bind the collected data frames by rows.
Reduce(rbind, lapply(df.l, function(df) if(!any.consecutive.in.col.p(df, "Variable.2")) {df}))
Variable.1 Variable.2
1 1 a
2 1 b
6 3 a
7 3 c
8 3 a
Say you want to remove all groups of df, grouped by a, where column b has consecutive repeated values. You can do that as below.
set.seed(0)
df <- data.frame(a = rep(1:3, rep(3, 3)), b = sample(1:5, 9, T))
# dplyr
library(dplyr)
df %>%
group_by(a) %>%
filter(all(b != lag(b), na.rm = T))
#data.table
library(data.table)
setDT(df)
df[, if(all(b != shift(b), na.rm = T)) .SD, by = a]
A benchmark shows data.table is faster:
#Results
# Unit: milliseconds
# expr min lq mean median uq max neval
# use_dplyr() 141.46819 165.03761 201.0975 179.48334 205.82301 539.5643 100
# use_DT() 36.27936 50.23011 64.9218 53.87114 66.73943 345.2863 100
# Method
set.seed(0)
df <- data.table(a = rep(1:2000, rep(1e3, 2000)), b = sample(1:1e3, 2e6, T))
use_dplyr <- function(x){
df %>%
group_by(a) %>%
filter(all(b != lag(b), na.rm = T))
}
use_DT <- function(x){
df[, if (all(b != shift(b), na.rm = T)) .SD, a]
}
library(microbenchmark)
microbenchmark(use_dplyr(), use_DT())

Resources