Assign specific value based on two columns in R

I have a dataframe which looks like:
Student_ID Number Position
VB-123 10 2
VB-456 15 5
VB-789 25 25
VB-889 12 2
VB-965 15 7
VB-758 45 9
VB-245 25 25
I want to add a new column and assign a value based on the conditions below:
If only Number is duplicated anywhere in the dataframe, assign "A".
If only Position is duplicated anywhere in the dataframe, assign "B".
If both Number and Position are duplicated, assign "C".
If neither is duplicated, assign "D".
The output would look like:
Student_ID Number Position Assign
VB-123 10 2 B
VB-456 15 5 A
VB-789 25 25 C
VB-889 12 2 B
VB-965 15 7 A
VB-758 45 9 D
VB-245 25 25 C

With dplyr,
library(dplyr)
students <- data.frame(Student_ID = c("VB-123", "VB-456", "VB-789", "VB-889", "VB-965", "VB-758", "VB-245"),
Number = c(10L, 15L, 25L, 12L, 15L, 45L, 25L),
Position = c(2L, 5L, 25L, 2L, 7L, 9L, 25L))
students2 <- students %>%
mutate_at(vars(Number, Position), funs(n = table(.)[as.character(.)])) %>%
mutate(Assign = case_when(Number_n > 1 & Position_n > 1 ~ 'C',
Number_n > 1 ~ 'A',
Position_n > 1 ~ 'B',
TRUE ~ 'D'))
students2
#> Student_ID Number Position Number_n Position_n Assign
#> 1 VB-123 10 2 1 2 B
#> 2 VB-456 15 5 2 1 A
#> 3 VB-789 25 25 2 2 C
#> 4 VB-889 12 2 1 2 B
#> 5 VB-965 15 7 2 1 A
#> 6 VB-758 45 9 1 1 D
#> 7 VB-245 25 25 2 2 C
As an alternative to the mutate_at line, you could use add_count twice, renaming as necessary. To remove the intermediary columns, tack on select(-matches('_n$')).
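A minimal sketch of that add_count() route (assuming a dplyr version where add_count() has a name argument; note that funs() above is deprecated in recent dplyr):
library(dplyr)
students %>%
  add_count(Number, name = "Number_n") %>%
  add_count(Position, name = "Position_n") %>%
  mutate(Assign = case_when(Number_n > 1 & Position_n > 1 ~ 'C',
                            Number_n > 1 ~ 'A',
                            Position_n > 1 ~ 'B',
                            TRUE ~ 'D')) %>%
  select(-matches('_n$'))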
You can more or less replicate the logic in base by assigning to subsets:
students2 <- cbind(students, lapply(students[2:3], function(x) table(x)[as.character(x)]))
students2$Assign <- 'D'
students2$Assign[students2$Number.Freq > 1 & students2$Position.Freq > 1] <- 'C'
students2$Assign[students2$Number.Freq > 1 & students2$Position.Freq == 1] <- 'A'
students2$Assign[students2$Number.Freq == 1 & students2$Position.Freq > 1] <- 'B'
students2[4:7] <- NULL
students2
#> Student_ID Number Position Assign
#> 1 VB-123 10 2 B
#> 2 VB-456 15 5 A
#> 3 VB-789 25 25 C
#> 4 VB-889 12 2 B
#> 5 VB-965 15 7 A
#> 6 VB-758 45 9 D
#> 7 VB-245 25 25 C

Here is an option using base R. Create a list of column names in the order of evaluation ('l1'), pre-assign 'D' to create the 'Assign' column in 'dat', then loop through the sequence of 'l1': subset the columns of the data based on the column names in 'l1', use duplicated() to find the duplicated elements, and reassign the 'Assign' column to the corresponding LETTER.
l1 <- list("Number", "Position", c("Number", "Position"))
dat$Assign <- rep("D", nrow(dat))
for(i in seq_along(l1)){
df <- dat[l1[[i]]]
i1 <- duplicated(df)|duplicated(df, fromLast = TRUE)
dat$Assign <- replace(dat$Assign, i1, LETTERS[i])
}
Output
dat
# Student_ID Number Position Assign
#1 VB-123 10 2 B
#2 VB-456 15 5 A
#3 VB-789 25 25 C
#4 VB-889 12 2 B
#5 VB-965 15 7 A
#6 VB-758 45 9 D
#7 VB-245 25 25 C

A solution using dplyr.
library(dplyr)
dat2 <- dat %>% count(Number)
dat3 <- dat %>% count(Position)
dat4 <- dat %>% count(Number, Position)
dat5 <- dat %>%
left_join(dat2, by = "Number") %>%
left_join(dat3, by = "Position") %>%
left_join(dat4, by = c("Number", "Position")) %>%
mutate(Assign = case_when(
n > 1 ~ "C",
n.x > 1 & n.y == 1 ~ "A",
n.y > 1 & n.x == 1 ~ "B",
TRUE ~ "D"
)) %>%
select(-n.x, -n.y, -n)
dat5
# Student_ID Number Position Assign
# 1 VB-123 10 2 B
# 2 VB-456 15 5 A
# 3 VB-789 25 25 C
# 4 VB-889 12 2 B
# 5 VB-965 15 7 A
# 6 VB-758 45 9 D
# 7 VB-245 25 25 C
DATA
dat <- read.table(text = "Student_ID Number Position
'VB-123' 10 2
'VB-456' 15 5
'VB-789' 25 25
'VB-889' 12 2
'VB-965' 15 7
'VB-758' 45 9
'VB-245' 25 25",
header = TRUE, stringsAsFactors = FALSE)
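For reference, the duplicated() logic from the loop answer above also fits in a compact base R expression (my sketch, using the same dat; the arithmetic indexes into c('D','A','B','C')):
# TRUE for every value that occurs more than once anywhere in the column
dup2 <- function(x) duplicated(x) | duplicated(x, fromLast = TRUE)
# 1 = neither, +1 if Number duplicated, +2 if Position duplicated
dat$Assign <- c("D", "A", "B", "C")[1 + dup2(dat$Number) + 2 * dup2(dat$Position)]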

Simultaneously remove the first and last rows of a data frame until reaching a row that does not have an NA

I have a dataframe that contains NA values, and I want to remove some rows that have an NA (i.e., not complete cases). However, I only want to remove rows at the beginning and end of the dataframe. So, I want to keep any rows that have an NA that are not in the first or last rows of the dataframe. What is the most efficient way to simultaneously remove these rows with NAs without using a row index? This is related to my previous question, but I also want to remove the first rows at the same time. There are other posts that focus on removing only the first rows, but not both.
Data
df <- structure(list(var1 = 1:15,
var2 = c(3, NA, 3, NA, 2, NA, 3, 4, 2, NA, 4, 2, 45, 2, 1),
var3 = c(6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, NA, NA, NA, NA),
var4 = c(NA, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, NA)),
class = "data.frame", row.names = c(NA, -15L))
Expected Output
So, in this example, I removed rows 1 to 2 and 12 to 15 since they have an NA, while rows 3 and 11 do not.
var1 var2 var3 var4
1 3 3 8 8
2 4 NA 9 9
3 5 2 10 10
4 6 NA 11 11
5 7 3 12 12
6 8 4 13 13
7 9 2 14 14
8 10 NA 15 15
9 11 4 16 16
I know that I could have 2 statements in filter to remove the top and bottom rows (shown below). But I'm wondering if there is a more efficient way to do this with really large datasets (open to any method: tidyverse, base R, data.table, etc.).
library(dplyr)
df %>%
filter(cumsum(complete.cases(.)) != 0 &
rev(cumsum(rev(complete.cases(.)))) != 0)
base R
r <- rle(complete.cases(df))
str(r, vec.len = 9)
# List of 2
# $ lengths: int [1:9] 2 1 1 1 1 3 1 1 4
# $ values : logi [1:9] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
# - attr(*, "class")= chr "rle"
r$values[ -c(1, length(r$values)) ] <- TRUE
str(r, vec.len = 9)
# List of 2
# $ lengths: int [1:9] 2 1 1 1 1 3 1 1 4
# $ values : logi [1:9] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
# - attr(*, "class")= chr "rle"
df[inverse.rle(r),]
# var1 var2 var3 var4
# 3 3 3 8 8
# 4 4 NA 9 9
# 5 5 2 10 10
# 6 6 NA 11 11
# 7 7 3 12 12
# 8 8 4 13 13
# 9 9 2 14 14
# 10 10 NA 15 15
# 11 11 4 16 16
dplyr
For your question of efficiency, you can adapt the rle solution to dplyr as well (a sketch follows the output below), but I see no reason why the use of complete.cases and cumany/rev would be a problem. You can improve on your attempt by computing complete.cases(.) once and storing it in an interim column, rather than calculating it twice.
library(dplyr)
df %>%
mutate(aux = complete.cases(cur_data())) %>%
filter(cumany(aux) & rev(cumany(rev(aux))))
# var1 var2 var3 var4 aux
# 1 3 3 8 8 TRUE
# 2 4 NA 9 9 FALSE
# 3 5 2 10 10 TRUE
# 4 6 NA 11 11 FALSE
# 5 7 3 12 12 TRUE
# 6 8 4 13 13 TRUE
# 7 9 2 14 14 TRUE
# 8 10 NA 15 15 FALSE
# 9 11 4 16 16 TRUE
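For completeness, here is one way the rle logic might be folded into the pipeline (a sketch, assuming dplyr >= 1.0 for cur_data(); this was not spelled out in the original answer):
keep_interior <- function(cc) {
  # cc: logical vector of complete cases; flip every interior run to TRUE
  # so that only the leading and trailing incomplete runs get dropped
  r <- rle(cc)
  r$values[-c(1, length(r$values))] <- TRUE
  inverse.rle(r)
}
df %>%
  filter(keep_interior(complete.cases(cur_data())))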
data.table
(Just an adaptation of the dplyr version.)
library(data.table)
setDT(df)
df[, aux := complete.cases(.SD)
][ cumsum(aux) > 0 & rev(cumsum(rev(aux)) > 0), ]
# var1 var2 var3 var4 aux
# <int> <num> <int> <int> <lgcl>
# 1: 3 3 8 8 TRUE
# 2: 4 NA 9 9 FALSE
# 3: 5 2 10 10 TRUE
# 4: 6 NA 11 11 FALSE
# 5: 7 3 12 12 TRUE
# 6: 8 4 13 13 TRUE
# 7: 9 2 14 14 TRUE
# 8: 10 NA 15 15 FALSE
# 9: 11 4 16 16 TRUE
I would do
na_count <- rowSums(is.na(df))
df <- df %>%
slice(min(which(na_count==0)):max(which(na_count==0)))
Output
> df
var1 var2 var3 var4
1 3 3 8 8
2 4 NA 9 9
3 5 2 10 10
4 6 NA 11 11
5 7 3 12 12
6 8 4 13 13
7 9 2 14 14
8 10 NA 15 15
9 11 4 16 16
I think we overcomplicated it a bit; the most efficient, I think, is plain base R.
Directly take all your complete cases:
s <- which(complete.cases(df))
We surely cannot subset on s alone, as we want to keep all the "in between" incomplete rows too; we can achieve that by simply subsetting from the first up to the last index (first() and last() here are dplyr helpers; in pure base R you could use min(s):max(s)).
df[first(s):last(s), ]
Continuing the rle love fest:
(which(rle(rowSums(df))$values != 'NA')[1]):dplyr::last(which(rle(rowSums(df))$values != 'NA'))
[1] 3 4 5 6 7 8 9 10 11
or, dispensing with dplyr:
(which(rle(rowSums(df))$values != 'NA')[1]):(which(rle(rowSums(df))$values != 'NA'))[[(length(which(rle(rowSums(df))$values != 'NA')))]]
[1] 3 4 5 6 7 8 9 10 11
Another possible solution (thanks, @r2evans, for suggesting complete.cases):
library(dplyr)
df %>%
mutate(aux = !complete.cases(.)) %>%
filter(!cumall(aux)) %>%
arrange(desc(var1)) %>%
filter(!cumall(aux)) %>%
arrange(var1) %>%
select(-aux)
#> var1 var2 var3 var4
#> 1 3 3 8 8
#> 2 4 NA 9 9
#> 3 5 2 10 10
#> 4 6 NA 11 11
#> 5 7 3 12 12
#> 6 8 4 13 13
#> 7 9 2 14 14
#> 8 10 NA 15 15
#> 9 11 4 16 16
Bit late to the party, but base R in a single expression:
df[Reduce(
function(x, y){
seq(from = x, to = y)
},
range(
which(
complete.cases(df)
)
)
), ]
Benchmark
Here, I create a bigger dataset with 1,000,000 rows of 3 variables to determine which method is the fastest. *Note: It will take a few seconds to apply the NA values randomly to the 3 columns for the first 100,000 rows and the last 100,000 rows. Essentially, with this example, we want to remove the first 100,000 rows and the last 100,000 rows.
Dataset
set.seed(203)
df <- data.frame(var1 = sample(x = 1:500, size = 1000000, replace = TRUE),
var2 = sample(x = 1:500, size = 1000000, replace = TRUE),
var3 = sample(x = 1:500, size = 1000000, replace = TRUE))
df[1:100000,] <- plyr::ddply(df[1:100000,], plyr::.(var1, var2, var3), function(x) {x[sample(x = 1:3, size = 1, replace = TRUE)] <- NA;x})
df[900000:1000000,] <- plyr::ddply(df[900000:1000000,], plyr::.(var1, var2, var3), function(x) {x[sample(x = 1:3, size = 1, replace = TRUE)] <- NA;x})
df[300000:400000,2] <- NA
Output
It looks like @MerijnvanTilborg's data.table solution is the fastest, followed by @r2evans' data.table version, on this sample dataset.
Code
library(tidyverse)
library(data.table)
df1 <- df
dt1 <- as.data.table(df)
dt2 <- as.data.table(df)
bm <- microbenchmark::microbenchmark(baseR_r2evans = {r <- rle(complete.cases(df1));
r$values[ -c(1, length(r$values)) ] <- TRUE; df1[inverse.rle(r),]},
dplyr_r2evans = {df %>%
dplyr::mutate(aux = complete.cases(cur_data())) %>%
dplyr::filter(cumany(aux) & rev(cumany(rev(aux))))},
datatable_r2evans = {dt1[, aux := complete.cases(.SD)
][ cumsum(aux) > 0 & rev(cumsum(rev(aux)) > 0), ]},
valkyr = {na_count <- rowSums(is.na(df)); df %>%
dplyr::slice(min(which(na_count==0)):max(which(na_count==0)))},
PaulS = {df %>%
dplyr::mutate(aux = !complete.cases(.)) %>%
dplyr::filter(!cumall(aux)) %>%
dplyr::arrange(desc(var1)) %>%
dplyr::filter(!cumall(aux)) %>%
dplyr::arrange(var1) %>%
dplyr::select(-aux)},
Chris = {df[(which(rle(rowSums(df))$values != 'NA')[1]):(which(rle(rowSums(df))$values != 'NA'))[[(length(which(rle(rowSums(df))$values != 'NA')))]],]},
AndrewGB = {df %>%
dplyr::filter(cumsum(complete.cases(.)) != 0 &
rev(cumsum(rev(complete.cases(.)))) != 0)},
Merijn_baseR = {s <- which(complete.cases(df));
df[first(s):last(s), ]},
Merijn_datatable = {dt2[, aux := complete.cases(.SD)][first(which(aux)):last(which(aux))]},
times = 1000
)
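To view the results once the benchmark finishes (standard microbenchmark usage; the original post omitted the printout):
print(bm)              # summary table of timings per expression
ggplot2::autoplot(bm)  # distribution plot of the timings (requires ggplot2)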

Dynamic Columns in Dplyr using NSE on the RHS

I am attempting to reference existing columns in dplyr through a loop. Effectively, I would like the operations defined in one table (evaluation in the example below) to be performed on another table (dt in the example below). I do not want to hardcode the column names on the RHS within mutate(); I would like to control which evaluations are performed from the evaluation table below. So I am trying to make the process dynamic.
Here is a sample dataframe:
dt = data.frame(
A = c(1:20),
B = c(11:30),
C = c(21:40),
AA = rep(1, 20),
BB = rep(2, 20)
)
Here is a table of sample operations to be performed:
evaluation = data.frame(
New_Var = c("AA", "BB"),
Operation = c("(A*2) > B", "(B*2) <= C"),
Result = c("True", "False")
) %>% mutate_all(as.character)
What I am trying to do is the following:
for (i in 1:nrow(evaluation)) {
var = evaluation$New_Var[i]
dt = dt %>%
rowwise() %>%
mutate(!!var := ifelse(eval(parse(text = evaluation$Operation[i])),
evaluation$Result[i],
!!var))
}
My desired result would be something like this, except that the "AA" entries in the AA column would be the original numeric values 1, 1, 1, 1, 1.
UPDATED:
I believe my syntax in the "False" part of the ifelse statement is incorrect. What is the correct syntax to specify "!!var" in the false portion of the ifelse statement?
I know there are other ways to do it using base R, but I would rather do it through dplyr as it is cleaner code to look at. I am leveraging rowwise() to do it element by element.
Modified data to (a) enforce type consistency for columns AA and BB and (b) ensure that at least one row satisfies the second condition.
dt = tibble(
A = c(1:20),
B = c(10:29), ## Note the change
C = c(21:40),
AA = rep("a", 20), ## Note initialization with strings
BB = rep("b", 20) ## Ditto
)
To make your loop work, you need to convert your code strings into actual expressions. You can use rlang::sym() for variable names and rlang::parse_expr() for everything else.
for( i in 1:nrow(evaluation) )
{
var <- rlang::sym(evaluation$New_Var[i])
op <- rlang::parse_expr(evaluation$Operation[i])
dt = dt %>% rowwise() %>%
mutate(!!var := ifelse(!!op, evaluation$Result[i],!!var))
}
# # A tibble: 20 x 5
# A B C AA BB
# <int> <int> <int> <chr> <chr>
# 1 1 10 21 a False
# 2 2 11 22 a False
# 3 3 12 23 a b
# 4 4 13 24 a b
# 5 5 14 25 a b
# 6 6 15 26 a b
# 7 7 16 27 a b
# 8 8 17 28 a b
# 9 9 18 29 a b
# 10 10 19 30 True b
# 11 11 20 31 True b
# 12 12 21 32 True b
# 13 13 22 33 True b
# 14 14 23 34 True b
# 15 15 24 35 True b
# 16 16 25 36 True b
# 17 17 26 37 True b
# 18 18 27 38 True b
# 19 19 28 39 True b
# 20 20 29 40 True b
Assuming that Felipe's answer was the functionality you desired, here's a more "tidyverse"/pipe-oriented/functional approach.
Data
library(rlang)
library(dplyr)
library(purrr)
operations <- tibble(
old_var = exprs(A, B),
new_var = exprs(AA, BB),
test = exprs(2*A > B, 2*B <= C),
result = exprs("True", "False")
)
original <- tibble(
A = sample.int(30, 10),
B = sample.int(30, 10),
C = sample.int(30, 10)
)
original
# A tibble: 10 x 3
A B C
<int> <int> <int>
1 4 20 5
2 30 29 11
3 1 27 14
4 2 21 4
5 17 19 24
6 14 25 9
7 5 22 22
8 6 13 7
9 25 4 21
10 12 11 12
Functions
# Here's your reusable functions
generic_mutate <- function(dat, new_var, test, result, old_var) {
dat %>% mutate(!!new_var := ifelse(!!test, !!result, !!old_var))
}
generic_ops <- function(dat, ops) {
pmap(ops, generic_mutate, dat = dat) %>%
reduce(full_join)
}
generic_mutate takes a single original dataframe, a single new_var, etc. It performs the test, adds the new column with the appropriate name and values.
generic_ops is the "vectorized" version. It takes the original dataframe as the first argument, and a dataframe of operations as the second. It then parallel maps over each column of new variable names, tests, etc, and calls generic_mutate on each one. That results in a list of dataframes, each with one added column. The reduce then combines them back all together with a sequential full_join.
Results
original %>%
generic_ops(operations)
Joining, by = c("A", "B", "C")
# A tibble: 10 x 5
A B C AA BB
<int> <int> <int> <chr> <chr>
1 4 20 5 4 20
2 30 29 11 True 29
3 1 27 14 1 27
4 2 21 4 2 21
5 17 19 24 True 19
6 14 25 9 True 25
7 5 22 22 5 22
8 6 13 7 6 13
9 25 4 21 True False
10 12 11 12 True 11
The magic here is using exprs(...) so you can store NSE names and operations in a tibble without forcing their evaluation. I think this is a lot cleaner than storing names and operations in strings with quotation marks.
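As a small illustration of what exprs() buys you (my sketch, not part of the answer):
library(rlang)
# exprs() quotes its arguments: the comparison is stored, not evaluated
e <- exprs(test = 2 * A > B)
e$test
#> 2 * A > B
# the stored expression can be evaluated later against a data mask
eval_tidy(e$test, data = list(A = 10L, B = 5L))
#> [1] TRUE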
How's this:
evaluation = data.frame(
Old_Var = c('A', 'B'),
New_Var = c("AA", "BB"),
Operation = c("(A*2) > B", "(B*2) <= C"),
Result = c("True", "False")
) %>% mutate_all(as.character)
for (i in 1:nrow(evaluation)) {
old <- sym(evaluation$Old_Var[i])
new <- sym(evaluation$New_Var[i])
op <- sym(evaluation$Operation[i])
res <- sym(evaluation$Result[i])
dt <- dt %>%
mutate(!!new := ifelse(!!op, !!res, !!old))
}
EDIT: My last answer doesn't work because rlang tries to find a variable named !!op (e.g. named (A*2) > B) instead of evaluating the expression. I got this to work using a mix of rlang and base R. You can of course follow @Brian's advice and use this solution with pmap. I honestly don't know how well this will perform, though, as I think it will evaluate the ifelse once per row, and I am not sure it's a vectorized operation...
dt <- tibble(
A = c(1:20),
B = c(11:30),
C = c(21:40),
AA = rep(1, 20),
BB = rep(2, 20)
)
evaluation = tibble(
Old_Var = c('A', 'B'),
New_Var = c("AA", "BB"),
Operation = c('(A*2) > B', '(B*2) <= C'),
Result = c("True", "False")
)
for (i in 1:nrow(evaluation)) {
old <- evaluation$Old_Var[i]
new <- evaluation$New_Var[i]
op <- evaluation$Operation[i]
res <- evaluation$Result[i]
dt <- dt %>%
mutate(!!sym(new) := eval(parse(text = sprintf('ifelse(%s, "%s", %s)', op, res, old))))
}
One way is to rework the conditions first, then pass them to mutate :
conds <- parse(text=evaluation$Operation) %>%
as.list() %>%
setNames(evaluation$New_Var) %>%
imap(~expr(ifelse(!!.,"True", !!sym(.y))))
conds
#> $AA
#> ifelse((A * 2) > B, "True", AA)
#>
#> $BB
#> ifelse((B * 2) <= C, "True", BB)
dt %>% mutate(!!!conds)
#> A B C AA BB
#> 1 1 11 21 1 2
#> 2 2 12 22 1 2
#> 3 3 13 23 1 2
#> 4 4 14 24 1 2
#> 5 5 15 25 1 2
#> 6 6 16 26 1 2
#> 7 7 17 27 1 2
#> 8 8 18 28 1 2
#> 9 9 19 29 1 2
#> 10 10 20 30 1 2
#> 11 11 21 31 True 2
#> 12 12 22 32 True 2
#> 13 13 23 33 True 2
#> 14 14 24 34 True 2
#> 15 15 25 35 True 2
#> 16 16 26 36 True 2
#> 17 17 27 37 True 2
#> 18 18 28 38 True 2
#> 19 19 29 39 True 2
#> 20 20 30 40 True 2

Summarize values by group, but keep original data

I am trying to figure out how to sum the values belonging to categories a and b by the factor file, while also keeping the original data.
library(dplyr)
df <- data.frame(ID = 1:20, values = runif(20), category = rep(letters[1:5], 4), file = as.factor(sort(rep(1:5, 4))))
ID values category file
1 1 0.65699229 a 1
2 2 0.70506478 b 1
3 3 0.45774178 c 1
4 4 0.71911225 d 1
5 5 0.93467225 e 1
6 6 0.25542882 a 2
7 7 0.46229282 b 2
8 8 0.94001452 c 2
9 9 0.97822643 d 2
10 10 0.11748736 e 2
11 11 0.47499708 a 3
12 12 0.56033275 b 3
13 13 0.90403139 c 3
14 14 0.13871017 d 3
15 15 0.98889173 e 3
16 16 0.94666823 a 4
17 17 0.08243756 b 4
18 18 0.51421178 c 4
19 19 0.39020347 d 4
20 20 0.90573813 e 4
so that
df[1,2] will be added to df[2,2] to form category 'ab' for file 1,
df[6,2] will be added to df[7,2] to form category 'ab' for file 2,
etc.
So far I have this:
df %>%
filter(category %in% c('a' , 'b')) %>%
group_by(file) %>%
summarise(values = sum(values))
Problem
I would like to change the category of the summed values to "ab" and append it to the original data frame in the same pipeline.
Desired output:
ID values category file
1 1 0.65699229 a 1
2 2 0.70506478 b 1
3 3 0.45774178 c 1
4 4 0.71911225 d 1
5 5 0.93467225 e 1
6 6 0.25542882 a 2
7 7 0.46229282 b 2
8 8 0.94001452 c 2
9 9 0.97822643 d 2
10 10 0.11748736 e 2
11 11 0.47499708 a 3
12 12 0.56033275 b 3
13 13 0.90403139 c 3
14 14 0.13871017 d 3
15 15 0.98889173 e 3
16 16 0.94666823 a 4
17 17 0.08243756 b 4
18 18 0.51421178 c 4
19 19 0.39020347 d 4
20 20 0.90573813 e 4
21 21 1.25486225 ab 1
22 22 1.87216325 ab 2
23 23 1.36548126 ab 3
This will get you the result
df %>% bind_rows(
df %>%
filter(category %in% c('a' , 'b')) %>%
group_by(file) %>%
mutate(values = sum(values), category = paste0(category,collapse='')) %>%
filter(row_number() == 1 & n() > 1)
) %>% mutate(ID = row_number())
BTW, the code to produce the dataframe in the example is this one:
df <- data.frame(ID = 1:20, values = runif(20), category = rep(letters[1:5], 4), file = as.factor(sort(rep(1:4, 5))))
Now let's say you want to sum multiple columns; you need to provide the column names in a vector:
cols = c("values") # columns to be sum
df %>% bind_rows(
df %>%
filter(category %in% c('a' , 'b')) %>%
group_by(file) %>%
mutate_at(vars(cols), sum) %>%
mutate(category = paste0(category,collapse='')) %>%
filter(row_number() == 1 & n() > 1)
) %>% mutate(ID = row_number())
library(dplyr)
df1 %>%
filter(category %in% c('a' , 'b')) %>%
group_by(file) %>%
filter(n_distinct(category) > 1) %>%
summarise(values = sum(values)) %>%
mutate(category="ab",
ID=max(df1$ID)+1:n()) %>%
bind_rows(df1, .)
#> Warning in bind_rows_(x, .id): binding factor and character vector,
#> coercing into character vector
#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector
#> ID values category file
#> 1 1 0.62585921 a 1
#> 2 2 0.61865851 b 1
#> 3 3 0.05274456 c 1
#> 4 4 0.68156961 d 1
#> ...
#> 19 19 0.43239411 d 5
#> 20 20 0.85886314 e 5
#> 21 21 1.24451773 ab 1
#> 22 22 0.99001810 ab 2
#> 23 23 1.25331943 ab 3
This data.table approach uses a self-join to get all of the possible two-character combinations.
library(data.table)
setDT(df)
df_self_join <- df[df, on = .(file), allow.cartesian = T
][category != i.category,
.(category = paste0(i.category, category), values = values + i.values, file)
][order(category), .(ID = .I + nrow(df), values, category, file)]
rbindlist(list(df, df_self_join))
ID values category file
1: 1 0.76984382 a 1
2: 2 0.54311583 b 1
3: 3 0.23462016 c 1
4: 4 0.60179043 d 1
...
20: 20 0.03534223 e 5
21: 21 1.31295965 ab 1
22: 22 0.51666175 ab 2
23: 23 1.02305754 ab 3
24: 24 1.00446399 ac 1
25: 25 0.96910373 ac 2
26: 26 0.87795389 ac 4
#total of 80 rows
Here is a pretty close dplyr translation:
library(dplyr)
tib <- as_tibble(df)
inner_join(tib, tib, by = 'file')%>%
filter(ID.x != ID.y)%>%
transmute(category = paste0(category.x, category.y)
, values = values.x + values.y
, file)%>%
arrange(category)%>%
bind_rows(tib, .)%>%
mutate(ID = row_number())%>%
filter(category == 'ab') #filter added to show the "ab" files
# A tibble: 3 x 4
ID values category file
<int> <dbl> <chr> <fct>
1 21 1.31 ab 1
2 22 0.517 ab 2
3 23 1.02 ab 3

Subset dataframe based on Values in second dataframe

I have one dataframe, df1, that has two columns, as such:
> head(df1[,c(10,11)])
ColA ColB
1 12 20
2 7 5
3 32 38
4 37 46
5 15 15
6 4 4
I have a second dataframe, also with 2 columns with matching names. Instead, there are only two numbers, as such:
> head(df2)
ColA ColB
1 50 30
I want to subset values from df1 based on the value in the corresponding column from df2. Doing this manually would look like this:
colA_vector <- df1[df1$ColA < 50,]
colB_vector <- df1[df1$ColB < 30,]
How can I do so in a more general purpose way? I do not want to hardcode anything. The column name "ColA" or "ColB" could be anything (so solutions requiring those column names won't really work).
Thank you.
In base R we could do:
nms <- intersect(names(df1), names(df2))
df1[do.call(`&`, Map(`<`, df1[nms], df2[nms])),]
# ColA ColB
# 1 12 20
# 2 7 5
# 5 15 15
# 6 4 4
Or just df1[do.call('&', Map('<', df1, df2)),] if both data.frames have the same order of columns and same names.
Using the fuzzyjoin package might be more readable, however:
library(fuzzyjoin)
fuzzy_semi_join(df1, df2, match_fun = `<`)
# ColA ColB
# 1 12 20
# 2 7 5
# 5 15 15
# 6 4 4
data
df1 <- read.table(text="
ColA ColB
1 12 20
2 7 5
3 32 38
4 37 46
5 15 15
6 4 4",h=T,strin=F)
df2 <- read.table(text="ColA ColB
1 50 30",h=T,strin=F)
Create a function if we want to do the same task repeatedly
f1 <- function(dat1, dat2, colName) {
dat1[dat1[[colName]] < dat2[[colName]],]
}
f1(df1, df2, "ColA")
# ColA ColB
#1 12 20
#2 7 5
#3 32 38
#4 37 46
#5 15 15
#6 4 4
f1(df1, df2, "ColB")
# ColA ColB
#1 12 20
#2 7 5
#5 15 15
#6 4 4
data
df1 <- structure(list(ColA = c(12L, 7L, 32L, 37L, 15L, 4L), ColB = c(20L,
5L, 38L, 46L, 15L, 4L)), class = "data.frame", row.names = c(NA,
-6L))
df2 <- structure(list(ColA = 50L, ColB = 30L),
class = "data.frame", row.names = "1")
Using dplyr:
df1 %>%
filter(df1[,1] < df2[,1])
ColA ColB
1 12 20
2 7 5
3 32 38
4 37 46
5 15 15
6 4 4
df1 %>%
filter(df1[,2] < df2[,2])
ColA ColB
1 12 20
2 7 5
3 15 15
4 4 4
Subsetting based on both columns simultaneously:
df1 %>%
filter(df1[,1] < df2[,1] & df1[,2] < df2[,2])
ColA ColB
1 12 20
2 7 5
3 15 15
4 4 4
If you don't want to use the fuzzyjoin package or make your own function, you can just repeat the second dataframe.
df1 <- data.frame("ColA" = c(12, 7, 32),
"ColB" = c(20, 5, 38))
df2 <- data.frame("ColA" = 50,
"ColB" = 30)
n <- nrow(df1)
df2_new <- do.call("rbind", replicate(n, df2, simplify = FALSE))
df1_which <- as.data.frame(df1 < df2_new)
colA_vector <- df1[df1_which$ColA, "ColA"]
colB_vector <- df1[df1_which$ColB, "ColB"]
You can try a tidyverse function. The result is a list of the filtered data frames.
foo <- function(x, y, ColA, ColB){
require(tidyverse)
var1 <- quo_name(ColA)
var2 <- quo_name(ColB)
x %>%
select(a=!!var1, b=!!var2) %>%
mutate(colA_vector= a < y[[ColA]]) %>%
mutate(colB_vector= b < y[[ColB]]) %>%
gather(k, v, -a, -b) %>%
filter(v) %>%
split(.$k) %>%
map(~select(.,-v,-k))
}
foo(df1, df2, "ColA", "ColB")
$colA_vector
a b
1 12 20
2 7 5
3 32 38
4 37 46
5 15 15
6 4 4
$colB_vector
a b
7 12 20
8 7 5
9 15 15
10 4 4

Aggregate dataframe in rolling blocks of 3 rows

I have the following data frame as an example
df <- data.frame(score=letters[1:15], total1=1:15, total2=16:30)
> df
score total1 total2
1 a 1 16
2 b 2 17
3 c 3 18
4 d 4 19
5 e 5 20
6 f 6 21
7 g 7 22
8 h 8 23
9 i 9 24
10 j 10 25
11 k 11 26
12 l 12 27
13 m 13 28
14 n 14 29
15 o 15 30
I would like to aggregate my data frame by summing rows in blocks of three, grouped by their (distinct) names, i.e.
groups sum1 sum2
'a-b-c' 6 51
'd-e-f' 15 60
etc.
All the given answers to this kind of question assume that the grouping strings repeat across rows.
The usual aggregate function that I use to obtain the summary delivers a different result:
aggregate(df$total1, by=list(sum1=df$score %in% c('a','b','c'), sum2=df$score %in% c('d','e','f')), FUN=sum)
sum1 sum2 x
1 FALSE FALSE 99
2 TRUE FALSE 6
3 FALSE TRUE 15
If you want a tidyverse solution, here is one possibility:
df <- data.frame(score=letters[1:15], total1=1:15, total2=16:30)
library(dplyr)
df %>%
mutate(groups = case_when(
score %in% c("a","b","c") ~ "a-b-c",
score %in% c("d","e","f") ~ "d-e-f"
)) %>%
group_by(groups) %>%
summarise_if(is.numeric, sum)
returns
# A tibble: 3 x 3
groups total1 total2
<chr> <int> <int>
1 a-b-c 6 51
2 d-e-f 15 60
3 <NA> 99 234
Add a "groups" column with the category value.
df$groups = NA
and then define each group like this:
df$groups[df$score=="a" | df$score=="b" | df$score=="c" ] = "a-b-c"
Finally, aggregate by that column, as sketched below.
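A minimal sketch of that final aggregation (my addition; rows whose groups value is still NA are dropped by aggregate()'s formula interface):
aggregate(cbind(total1, total2) ~ groups, data = df, FUN = sum)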
Here's a solution that works for any sized data frame.
df <- data.frame(score=letters[1:15], total1=1:15, total2=16:30)
# I'm adding a row to demonstrate that the grouping pattern works when the
# number of rows is not equally divisible by 3.
df <- rbind(df, data.frame(score = letters[16], total1 = 16, total2 = 31))
# A vector that represents the correct groupings for the data frame.
groups <- c(rep(1:floor(nrow(df) / 3), each = 3),
rep(floor(nrow(df) / 3) + 1, nrow(df) - length(1:(nrow(df) / 3)) * 3))
# Your method of aggregation by `groups`. I'm going to use `data.table`.
require(data.table)
dt <- as.data.table(df)
dt[, group := groups]
aggDT <- dt[, list(score = paste0(score, collapse = "-"),
total1 = sum(total1), total2 = sum(total2)), by = group][
, group := NULL]
aggDT
score total1 total2
1: a-b-c 6 51
2: d-e-f 15 60
3: g-h-i 24 69
4: j-k-l 33 78
5: m-n-o 42 87
6: p 16 31
