Counting occurencies in every column in R - r

Hello I need to count the occurencies of every number in each column.
Example data-frame:
A B C
2 1 2
2 1 1
1 1 3
3 3 3
3 2 2
2 1 2
I want my output to look like this
how_much A B C
1 1 4 1
2 3 1 3
3 2 1 2

In tidyverse you could do:
library(tidyverse)
gather(df1) %>%
group_by(key,value) %>%
count() %>%
pivot_wider(value, names_from = key, values_from = n, values_fill = 0)
value A B C
<int> <int> <int> <int>
1 1 1 4 1
2 2 3 1 3
3 3 2 1 2

We can use table
table(unlist(df1), names(df1)[c(col(df1))])
-output
A B C
1 1 4 1
2 3 1 3
3 2 1 2
Or loop over the columns with sapply, and apply table
sapply(df1, table)
A B C
1 1 4 1
2 3 1 3
3 2 1 2
data
df1 <- structure(list(A = c(2L, 2L, 1L, 3L, 3L, 2L), B = c(1L, 1L, 1L,
3L, 2L, 1L), C = c(2L, 1L, 3L, 3L, 2L, 2L)),
class = "data.frame", row.names = c(NA,
-6L))

In order for the solution to be more flexible and can be used for any occurrence of numbers we can use the following solution using purrr package functions.
library(dplyr)
library(purrr)
df1 %>%
map(~ unique(.x) %>% sort()) %>% reduce(~ union(..1, ..2)) %>%
bind_cols(map_dfr(., ~ map_dfc(df1, function(a) sum(a == .x)))) %>%
rename(what = ...1)
# A tibble: 3 x 4
what A B C
<int> <int> <int> <int>
1 1 1 4 1
2 2 3 1 3
3 3 2 1 2

A slightly verbose answer, but it will work on all data types.
set.seed(1234)
df1 <- data.frame(A = sample(letters[1:3], 8, T),
B = sample(letters[1:3], 8, T),
C = sample(letters[1:3], 8, T))
df1
#> A B C
#> 1 b c b
#> 2 b b a
#> 3 a b c
#> 4 c b c
#> 5 a c c
#> 6 a b a
#> 7 b b b
#> 8 b b a
library(tidyverse)
unique(unlist(apply(df1, 1, unique))) %>% as.data.frame() %>% setNames('how_much') %>%
bind_cols(map_df(unique(unlist(apply(df1, 1, unique))), ~map_int(df1, \(x) sum(x %in% .x) ) ))
#> how_much A B C
#> 1 b 4 6 2
#> 2 c 1 2 3
#> 3 a 3 0 3
Created on 2021-06-23 by the reprex package (v2.0.0)

Related

Identify unique values within a multivariable subset

I have data that look like these:
Subject Site Date
1 2 '2020-01-01'
1 2 '2020-01-01'
1 2 '2020-01-02'
2 1 '2020-01-02'
2 1 '2020-01-03'
2 1 '2020-01-03'
And I'd like to create an order variable for unique dates by Subject and Site. i.e.
Want
1
1
2
1
2
2
I define a little wrapper:
rle <- function(x) cumsum(!duplicated(x))
and I notice inconsistent behavior when I supply:
have1 <- unlist(tapply(val$Date, val[, c( 'Site', 'Subject')], rle))
versus
have2 <- unlist(tapply(val$Date, val[, c('Subject', 'Site')], rle))
> have1
[1] 1 1 2 1 2 2
> have2
[1] 1 2 2 1 1 2
Is there any way to ensure that the natural ordering of the dataset is followed regardless of the specific columns supplied to the INDEX argument?
library(dplyr)
val %>%
group_by(Subject, Site) %>%
mutate(Want = match(Date, unique(Date))) %>%
ungroup
-output
# A tibble: 6 × 4
Subject Site Date Want
<int> <int> <chr> <int>
1 1 2 2020-01-01 1
2 1 2 2020-01-01 1
3 1 2 2020-01-02 2
4 2 1 2020-01-02 1
5 2 1 2020-01-03 2
6 2 1 2020-01-03 2
val$Want <- with(val, ave(as.integer(as.Date(Date)), Subject, Site,
FUN = \(x) match(x, unique(x))))
val$Want
[1] 1 1 2 1 2 2
data
val <- structure(list(Subject = c(1L, 1L, 1L, 2L, 2L, 2L), Site = c(2L,
2L, 2L, 1L, 1L, 1L), Date = c("2020-01-01", "2020-01-01", "2020-01-02",
"2020-01-02", "2020-01-03", "2020-01-03")),
class = "data.frame", row.names = c(NA,
-6L))

Using complete to fill groups with NA to have same length as the maximum group

I have this dataframe:
df <- structure(list(id = c(1L, 1L, 1L, 2L, 2L, 3L), var = c("A", "B",
"C", "B", "C", "C")), class = "data.frame", row.names = c(NA,
-6L))
id var
1 1 A
2 1 B
3 1 C
4 2 B
5 2 C
6 3 C
I would like to get this dataframe:
id var
1 1 A
2 1 B
3 1 C
4 2 <NA>
5 2 B
6 2 C
7 3 <NA>
8 3 <NA>
9 3 C
I would like to learn how to use complete or expand.grid in this situation
I have tried several ways but was not successful: One of my tries:
df %>%
complete(id, var, fill=list(NA))
Create a duplicate column of 'var' and then do the complete on the other column, which makes the NA in the 'var' column and then remove the duplicate 'var' column
library(dplyr)
library(tidyr)
df %>%
mutate(var1 = var) %>%
complete(id, var1) %>%
select(-var1)
-output
# A tibble: 9 × 2
id var
<int> <chr>
1 1 A
2 1 B
3 1 C
4 2 <NA>
5 2 B
6 2 C
7 3 <NA>
8 3 <NA>
9 3 C

Random sampling in R within Categorical variable

Suppose I have a data frame with categorical variable of n classes and a numerical variable. I need to randomize the numerical variable within each category. For example , consider the following table:
Col_1 Col_2
A 2
A 5
A 4
A 8
B 1
B 4
B 9
B 7
When I tried sample() function in R, it threw the result considering both the categories. Is there any function where I can get this kind of output? (with or without replacement, doesn't matter)
Col_1 Col_2
A 8
A 4
A 2
A 5
B 9
B 7
B 4
B 1
You could sample row numbers within groups. In base R, we can use ave
df[with(df, ave(seq_len(nrow(df)), Col_1, FUN = sample)), ]
# Col_1 Col_2
#2 A 5
#4 A 8
#1 A 2
#3 A 4
#7 B 9
#5 B 1
#8 B 7
#6 B 4
In dplyr, we can use sample_n
library(dplyr)
df %>% group_by(Col_1) %>% sample_n(n())
data
df <- structure(list(Col_1 = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("A", "B"), class = "factor"), Col_2 = c(2L, 5L,
4L, 8L, 1L, 4L, 9L, 7L)), class = "data.frame", row.names = c(NA, -8L))
Here's a dplyr solution:
library(dplyr)
set.seed(2)
dat %>%
group_by(Col_1) %>%
mutate(Col_2 = sample(Col_2)) %>%
ungroup()
# # A tibble: 8 x 2
# Col_1 Col_2
# <chr> <int>
# 1 A 2
# 2 A 4
# 3 A 5
# 4 A 8
# 5 B 7
# 6 B 9
# 7 B 1
# 8 B 4
A data.table method:
library(data.table)
datDT <- as.data.table(dat)
set.seed(2)
datDT[, Col_2 := sample(Col_2), by = "Col_1"]
datDT
# Col_1 Col_2
# 1: A 2
# 2: A 4
# 3: A 5
# 4: A 8
# 5: B 7
# 6: B 9
# 7: B 1
# 8: B 4
Data
dat <- read.table(header = TRUE, stringsAsFactors = FALSE, text = "
Col_1 Col_2
A 2
A 5
A 4
A 8
B 1
B 4
B 9
B 7")

Sorting and calculating sum and rank with new columns in R

I have 200 columns and want to calculate mean and rank and then generate columns. Here is an example of data
df<-read.table(text="Q1a Q2a Q3b Q4c Q5a Q6c Q7b
1 2 4 2 2 0 1
3 2 1 2 2 1 1
4 3 2 1 1 1 1",h=T)
I want to sum a, b and c for each row, and then sum them together. Next I want to calculate the rank for each row. I want to generate the following table:
Q1a Q2a Q3b Q4c Q5a Q6c Q7b a b c Total Rank
1 2 4 2 2 0 1 5 5 2 12 2
3 2 1 2 2 1 1 7 2 3 12 2
4 3 2 1 1 1 1 8 3 2 13 1
library(dplyr)
df %>%
cbind(sapply(c('a', 'b', 'c'), function(x) rowSums(.[, grep(x, names(.)), drop=FALSE]))) %>%
mutate(Total = a + b + c,
Rank = match(Total, sort(Total, decreasing = T)))
Output is:
Q1a Q2a Q3b Q4c Q5a Q6c Q7b a b c Total Rank
1 1 2 4 2 2 0 1 5 5 2 12 2
2 3 2 1 2 2 1 1 7 2 3 12 2
3 4 3 2 1 1 1 1 8 3 2 13 1
Sample data:
df <- structure(list(Q1a = c(1L, 3L, 4L), Q2a = c(2L, 2L, 3L), Q3b = c(4L,
1L, 2L), Q4c = c(2L, 2L, 1L), Q5a = c(2L, 2L, 1L), Q6c = c(0L,
1L, 1L), Q7b = c(1L, 1L, 1L)), class = "data.frame", row.names = c(NA,
-3L))
You can also go with the tidyverse approach. However, it is longer.
library(tidyverse)
df %>%
rownames_to_column(var = "ID") %>%
gather(question, value, -ID) %>%
mutate(type = substr(question, 3,3)) %>%
group_by(ID, type) %>%
summarise(sumType = sum(value, na.rm = TRUE)) %>%
as.data.frame() %>%
spread(type, sumType) %>%
mutate(Total = a+b+c,
Rank = match(Total, sort(Total, decreasing = T)))
Results:
ID a b c Total Rank
1 1 5 5 2 12 2
2 2 7 2 3 12 2
3 3 8 3 2 13 1

R: frequency with group by ID [duplicate]

This question already has answers here:
Frequency count of two column in R
(8 answers)
Closed 6 years ago.
I have a data frame like this:
ID Cont
1 a
1 a
1 b
2 a
2 c
2 d
I need to report the frequence of "Cont" by ID. The output should be
ID Cont Freq
1 a 2
1 b 1
2 a 1
2 c 1
2 d 1
Using dplyr, you can group_by both ID and Cont and summarise using n() to get Freq:
library(dplyr)
res <- df %>% group_by(ID,Cont) %>% summarise(Freq=n())
##Source: local data frame [5 x 3]
##Groups: ID [?]
##
## ID Cont Freq
## <int> <fctr> <int>
##1 1 a 2
##2 1 b 1
##3 2 a 1
##4 2 c 1
##5 2 d 1
Data:
df <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 2L), Cont = structure(c(1L,
1L, 2L, 1L, 3L, 4L), .Label = c("a", "b", "c", "d"), class = "factor")), .Names = c("ID",
"Cont"), class = "data.frame", row.names = c(NA, -6L))
## ID Cont
##1 1 a
##2 1 a
##3 1 b
##4 2 a
##5 2 c
##6 2 d
library(data.table)
setDT(x)[, .(Freq = .N), by = .(ID, Cont)]
# ID Cont Freq
# 1: 1 a 2
# 2: 1 b 1
# 3: 2 a 1
# 4: 2 c 1
# 5: 2 d 1
With base R:
df1 <- subset(as.data.frame(table(df)), Freq != 0)
if you want to order by ID, add this line:
df1[order(df1$ID)]
ID Cont Freq
1 1 a 2
3 1 b 1
2 2 a 1
6 2 c 1
8 2 d 1

Resources