Conditionally count rows based on comparing values between columns in R

I have the following data frame:
df <- structure(list(x = c(0.389794300700167, -1.20807617542949, -0.363676017470862,
-1.62667268170309, -0.256478394123992, 1.10177950308713, 0.755781508027337,
-0.238233556018718, 0.98744470341339, 0.741390128383824), y = c(0.0893472664958216,
-0.954943856152377, -0.195150384667239, 0.92552126209408, 0.482978524836611,
-0.596310636720207, -2.18528683816953, -0.674865937875116, -2.11906119191017,
-1.2651980215309), fac = structure(c(2L, 1L, 2L, 3L, 1L, 1L,
1L, 1L, 2L, 2L), .Label = c("A", "B", "C"), class = "factor")), .Names = c("x",
"y", "fac"), row.names = c(NA, -10L), class = "data.frame")
df # (manual_assignment below is not a column of df; it is shown to illustrate the intended classification)
#>             x           y fac manual_assignment
#> 1   0.3897943  0.08934727   B               b.x
#> 2  -1.2080762 -0.95494386   A               a.y
#> 3  -0.3636760 -0.19515038   B               b.y
#> 4  -1.6266727  0.92552126   C               c.y
#> 5  -0.2564784  0.48297852   A               a.y
#> 6   1.1017795 -0.59631064   A               a.x
#> 7   0.7557815 -2.18528684   A               a.x
#> 8  -0.2382336 -0.67486594   A               a.x
#> 9   0.9874447 -2.11906119   B               b.x
#> 10  0.7413901 -1.26519802   B               b.x
What I want to do is count rows by comparing the values in x and y. For each row, if x is larger than y, we increase the x count for that row's fac level by 1; otherwise we increase its y count. So the end result is this:
  x.count y.count
A       3       2   # e.g. A's y.count is the number of a.y rows above
B       3       1
C       0       1
How can I achieve that? Is it possible with dplyr?

This is more straightforward with table():
with(df, table(fac, ifelse(x > y, "x.count", "y.count")))
#fac   x.count y.count
#  A         3       2
#  B         3       1
#  C         0       1
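If a data frame is preferred over a table object, one small follow-up (a sketch using base R's as.data.frame.matrix()) is:
counts <- with(df, table(fac, ifelse(x > y, "x.count", "y.count")))
as.data.frame.matrix(counts)   # rows named A, B, C; columns x.count, y.count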
With dplyr/tidyr, you need a few more lines of code:
library(tidyverse)
df %>%
  group_by(fac, measure = if_else(x > y, "x.count", "y.count")) %>%
  tally() %>%
  spread(measure, n, fill = 0)
#Source: local data frame [3 x 3]
#Groups: fac [3]
# fac x.count y.count
#* <fctr> <dbl> <dbl>
#1 A 3 2
#2 B 3 1
#3 C 0 1
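spread() has since been superseded in tidyr; a variant of the same reshape with count() and pivot_wider() (a sketch, assuming tidyr >= 1.1 for values_fill) would be:
df %>%
  count(fac, measure = if_else(x > y, "x.count", "y.count")) %>%
  pivot_wider(names_from = measure, values_from = n, values_fill = 0)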

With dplyr we can group by fac and count the rows where x is greater than y and where y is greater than x.
library(dplyr)
df %>%
  group_by(fac) %>%
  summarise(x.count = length(which(x > y)),
            y.count = length(which(x < y)))
# fac x.count y.count
# <fctr> <int> <int>
#1 A 3 2
#2 B 3 1
#3 C 0 1
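Since TRUE counts as 1 when summed, the same summarise can be written a little more compactly (a minimal variant of the code above):
df %>%
  group_by(fac) %>%
  summarise(x.count = sum(x > y), y.count = sum(x < y))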

With data.table, we can use
library(data.table)
dcast(setDT(df)[, .N, .(fac, measure = c('y.count', 'x.count')[(x > y) + 1])],
      fac ~ measure, fill = 0)
# fac x.count y.count
#1: A 3 2
#2: B 3 1
#3: C 0 1

Related

Counting occurrences in every column in R

Hello, I need to count the occurrences of every number in each column.
Example data-frame:
A B C
2 1 2
2 1 1
1 1 3
3 3 3
3 2 2
2 1 2
I want my output to look like this
how_much A B C
1 1 4 1
2 3 1 3
3 2 1 2
In tidyverse you could do:
library(tidyverse)
gather(df1) %>%
  group_by(key, value) %>%
  count() %>%
  pivot_wider(id_cols = value, names_from = key, values_from = n, values_fill = 0)
value A B C
<int> <int> <int> <int>
1 1 1 4 1
2 2 3 1 3
3 3 2 1 2
We can use table
table(unlist(df1), names(df1)[c(col(df1))])
-output
A B C
1 1 4 1
2 3 1 3
3 2 1 2
Or loop over the columns with sapply, and apply table
sapply(df1, table)
A B C
1 1 4 1
2 3 1 3
3 2 1 2
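Note that sapply(df1, table) only collapses to a clean matrix when every column contains the same set of values. If that is not guaranteed, a hedged sketch that fixes the levels first:
lvls <- sort(unique(unlist(df1)))   # every value seen anywhere in df1
sapply(df1, function(col) table(factor(col, levels = lvls)))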
data
df1 <- structure(list(A = c(2L, 2L, 1L, 3L, 3L, 2L), B = c(1L, 1L, 1L,
3L, 2L, 1L), C = c(2L, 1L, 3L, 3L, 2L, 2L)),
class = "data.frame", row.names = c(NA,
-6L))
To make the solution more flexible so that it works for any set of values, we can use functions from the purrr package.
library(dplyr)
library(purrr)
df1 %>%
  map(~ unique(.x) %>% sort()) %>%
  reduce(~ union(..1, ..2)) %>%
  bind_cols(map_dfr(., ~ map_dfc(df1, function(a) sum(a == .x)))) %>%
  rename(what = ...1)
# A tibble: 3 x 4
what A B C
<int> <int> <int> <int>
1 1 1 4 1
2 2 3 1 3
3 3 2 1 2
A slightly verbose answer, but it will work on all data types.
set.seed(1234)
df1 <- data.frame(A = sample(letters[1:3], 8, TRUE),
                  B = sample(letters[1:3], 8, TRUE),
                  C = sample(letters[1:3], 8, TRUE))
df1
#> A B C
#> 1 b c b
#> 2 b b a
#> 3 a b c
#> 4 c b c
#> 5 a c c
#> 6 a b a
#> 7 b b b
#> 8 b b a
library(tidyverse)
unique(unlist(apply(df1, 1, unique))) %>%
  as.data.frame() %>%
  setNames('how_much') %>%
  bind_cols(map_df(unique(unlist(apply(df1, 1, unique))),
                   ~ map_int(df1, \(x) sum(x %in% .x))))
#> how_much A B C
#> 1 b 4 6 2
#> 2 c 1 2 3
#> 3 a 3 0 3

Count number of times an account_ID is shared between groups in a dataframe in R

I have a dataframe in R that has a large number of bank_account_IDs and Vendor_Codes. Bank_account_IDs should not be shared between Vendor_Codes, but sometimes a fraudulent vendor exists that shares another vendor's bank_account_ID.
I want to add a new field to the dataframe that gives, for each row, the number of other Vendor_Codes sharing the same bank_account_ID.
My sample dataframe is as follows:
bank_account_ID <- c("a", "b", "c", "a", "a", "d", "e", "f", "b", "c")
Vendor_Code <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
df <- data.frame(bank_account_ID, Vendor_Code)
My ideal new dataframe should look something like this:
bank_account_ID Vendor_Code duplicate_count
a 1 2
b 2 1
c 3 1
a 4 2
a 5 2
d 6 0
e 7 0
f 8 0
b 9 1
c 10 1
Thanks in advance!
We can get the number of distinct Vendor_Code values with n_distinct(), grouped by bank_account_ID, and subtract 1:
library(dplyr)
df %>%
  group_by(bank_account_ID) %>%
  mutate(dupe_count = n_distinct(Vendor_Code) - 1) %>%
  ungroup()
-output
# A tibble: 10 x 4
# bank_account_ID Vendor_Code duplicate_count dupe_count
# <chr> <int> <int> <dbl>
# 1 a 1 2 2
# 2 b 2 1 1
# 3 c 3 1 1
# 4 a 4 2 2
# 5 a 5 2 2
# 6 d 6 0 0
# 7 e 7 0 0
# 8 f 8 0 0
# 9 b 9 1 1
#10 c 10 1 1
data
df <- structure(list(bank_account_ID = c("a", "b", "c", "a", "a", "d",
"e", "f", "b", "c"), Vendor_Code = 1:10, duplicate_count = c(2L,
1L, 1L, 2L, 2L, 0L, 0L, 0L, 1L, 1L)), class = "data.frame", row.names = c(NA,
-10L))
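A base R equivalent of the grouped mutate (a sketch using ave(), assuming the same column names as above):
df$dupe_count <- ave(df$Vendor_Code, df$bank_account_ID,
                     FUN = function(v) length(unique(v)) - 1)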

How to use column indices to collect values from columns in R

x y z column_indices
6 7 1 1,2
5 4 2 3
1 3 2 1,3
I have the column indices of the values I would like to collect stored in a separate column, as shown above. What I want to create is something like this:
x y z column_indices values
6 7 1 1,2 6,7
5 4 2 3 2
1 3 2 1,3 1,2
What is the simplest way to do this in R?
Thanks!
In base R, we can use apply(): split column_indices on ',', convert the pieces to integers, and pull the corresponding values from the row.
df$values <- apply(df, 1, function(x) {
  inds <- as.integer(strsplit(x[4], ',')[[1]])
  toString(x[inds])
})
df
# x y z column_indices values
#1 6 7 1 1,2 6, 7
#2 5 4 2 3 2
#3 1 3 2 1,3 1, 2
data
df <- structure(list(x = c(6L, 5L, 1L), y = c(7L, 4L, 3L), z = c(1L,
2L, 2L), column_indices = structure(c(1L, 3L, 2L), .Label = c("1,2",
"1,3", "3"), class = "factor")), class = "data.frame", row.names = c(NA, -3L))
One solution involving dplyr and tidyr could be:
df %>%
  pivot_longer(-column_indices) %>%
  group_by(column_indices) %>%
  mutate(values = toString(value[1:n() %in% unlist(strsplit(as.character(column_indices), ","))])) %>%
  pivot_wider(names_from = "name", values_from = "value")
column_indices values x y z
<chr> <chr> <int> <int> <int>
1 1,2 6, 7 6 7 1
2 3 2 5 4 2
3 1,3 1, 2 1 3 2
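A more direct dplyr take (a sketch assuming dplyr >= 1.0 and the x, y, z layout shown above) works row by row instead of reshaping:
df %>%
  rowwise() %>%
  mutate(values = toString(
    c_across(x:z)[as.integer(strsplit(as.character(column_indices), ",")[[1]])]
  )) %>%
  ungroup()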

How to sum df when it contains characters?

I am trying to prep my data and I am stuck with one issue. Let's say I have the following data frame:
df1
Name C1 Val1
A a x1
A a x2
A b x3
A c x4
B d x5
B d x6
...
and I want to narrow down the df to
df2
Name C1 Val
A a,b,c x1+x2+x3+x4
B d x5+x6
...
where a is a character value and x is a numeric value.
I have been trying sapply, rowsum, and
df2 <- aggregate(df1, list(df1[,1]), FUN = summary)
but it just can't collapse the character values into a list for each Name.
Can someone help me obtain df2?
m <- function(x) if (is.numeric(x <- type.convert(x))) sum(x) else toString(unique(x))
aggregate(. ~ Name, df1, m)
Name C1 Val1
1 A a, b, c 10
2 B d 11
where
df1
Name C1 Val1
1 A a 1
2 A a 2
3 A b 3
4 A c 4
5 B d 5
6 B d 6
This is your df, with Val1 given the numbers 1 to 6:
df <-
structure(list(Name = structure(c(1L, 1L, 1L, 1L, 2L, 2L), .Label = c("A",
"B"), class = "factor"), C1 = structure(c(1L, 1L, 2L, 3L, 4L,
4L), .Label = c("a", "b", "c", "d"), class = "factor"), Val1 = 1:6), row.names = c(NA,
-6L), class = "data.frame")
We just use summarise:
df %>%
  group_by(Name) %>%
  summarise(C1 = paste(unique(C1), collapse = ","), Val1 = sum(Val1))
# A tibble: 2 x 3
Name C1 Val1
<fct> <chr> <int>
1 A a,b,c 10
2 B d 11
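The same idea scales to any number of columns with across() (a sketch assuming dplyr >= 1.0 and that Val1 is already numeric, as in the df above): sum the numeric columns and collapse the rest.
df %>%
  group_by(Name) %>%
  summarise(across(where(is.numeric), sum),
            across(where(Negate(is.numeric)), ~ paste(unique(.x), collapse = ",")))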
Quick and easy dplyr solution:
library(dplyr)
library(stringr)
df1 %>%
  mutate(Val1_num = as.numeric(str_extract(Val1, "\\d+"))) %>%
  group_by(Name) %>%
  summarise(C1 = paste(unique(C1), collapse = ","),
            Val1 = paste(unique(Val1), collapse = "+"),
            Val1_num = sum(Val1_num))
#> # A tibble: 2 x 4
#> Name C1 Val1 Val1_num
#> <chr> <chr> <chr> <dbl>
#> 1 A a,b,c x1+x2+x3+x4 10
#> 2 B d x5+x6 11
Or in base R:
df2 <- aggregate(df1, list(df1[, 1]), FUN = function(x) {
  if (all(grepl("\\d", x))) {
    sum(as.numeric(gsub("[^[:digit:]]", "", x)))
  } else {
    paste(unique(x), collapse = ",")
  }
})
df2
#> Group.1 Name C1 Val1
#> 1 A A a,b,c 10
#> 2 B B d 11
data
df1 <- read.csv(text = "
Name,C1,Val1
A,a,x1
A,a,x2
A,b,x3
A,c,x4
B,d,x5
B,d,x6", stringsAsFactors = FALSE)

R: frequency with group by ID [duplicate]

This question already has answers here:
Frequency count of two column in R
(8 answers)
Closed 6 years ago.
I have a data frame like this:
ID Cont
1 a
1 a
1 b
2 a
2 c
2 d
I need to report the frequency of "Cont" by ID. The output should be
ID Cont Freq
1 a 2
1 b 1
2 a 1
2 c 1
2 d 1
Using dplyr, you can group_by both ID and Cont and summarise using n() to get Freq:
library(dplyr)
res <- df %>% group_by(ID,Cont) %>% summarise(Freq=n())
##Source: local data frame [5 x 3]
##Groups: ID [?]
##
## ID Cont Freq
## <int> <fctr> <int>
##1 1 a 2
##2 1 b 1
##3 2 a 1
##4 2 c 1
##5 2 d 1
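count() wraps the same group_by() + summarise(n()) pattern in one step (a short variant; the name argument requires a reasonably recent dplyr):
df %>% count(ID, Cont, name = "Freq")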
Data:
df <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 2L), Cont = structure(c(1L,
1L, 2L, 1L, 3L, 4L), .Label = c("a", "b", "c", "d"), class = "factor")), .Names = c("ID",
"Cont"), class = "data.frame", row.names = c(NA, -6L))
## ID Cont
##1 1 a
##2 1 a
##3 1 b
##4 2 a
##5 2 c
##6 2 d
library(data.table)
setDT(df)[, .(Freq = .N), by = .(ID, Cont)]
# ID Cont Freq
# 1: 1 a 2
# 2: 1 b 1
# 3: 2 a 1
# 4: 2 c 1
# 5: 2 d 1
With base R:
df1 <- subset(as.data.frame(table(df)), Freq != 0)
If you want to order by ID, add this line:
df1[order(df1$ID), ]
ID Cont Freq
1 1 a 2
3 1 b 1
2 2 a 1
6 2 c 1
8 2 d 1
