Count values of the whole dataframe - r

I have this dataframe:
> df
X1 X2 X3 X4 X5 X6 X7
1 2 7 2 3 5 6 7
2 4 2 3 6 1 NA 3
3 3 6 4 4 4 7 7
4 6 5 6 NA 3 1 7
5 1 1 2 3 3 3 7
6 4 7 2 4 5 4 2
7 5 NA 4 5 2 2 3
8 3 7 2 4 4 1 5
9 4 5 6 2 5 6 3
10 2 4 6 4 5 6 3
And I want to count the numbers 1,2,3,4 and assign it to x, 6,7 and assign it to y, and all the numbers (1,2,3,4,5,6,7) to z. After this, I will compute y/z - x/z.
I've done it with table(unlist(df)) and then assigning each value individually. However, I'm looking for a solution without a loop or apply(), because I can't see how those would scale: I have nearly 100 columns and 10,000 rows (all values are integers from 1 to 7 or NA).
I'm looking for a solution like this:
x <- count(df, c(1,2,3,4), na.rm = TRUE)
y <- count(df, c(6,7), na.rm = TRUE)
z <- count(df, c(1,2,3,4,5,6,7), na.rm = TRUE)
However, it seems that count() doesn't work like that, nor does there seem to be an existing function that does this.
Any suggestions?

A base R solution.
vec <- unlist(df)
vec_c <- table(vec)
x <- sum(vec_c[names(vec_c) %in% as.character(1:4)])
y <- sum(vec_c[names(vec_c) %in% as.character(6:7)])
z <- sum(vec_c)
y/z - x/z
# [1] -0.358209
Another idea.
vec <- unlist(df)
x <- sum(vec %in% 1:4)
y <- sum(vec %in% 6:7)
z <- length(vec[!is.na(vec)])
y/z - x/z
# [1] -0.358209
Another idea.
m <- as.matrix(df)
x <- sum(m %in% 1:4)
y <- sum(m %in% 6:7)
z <- sum(!is.na(df))
y/z - x/z
# [1] -0.358209
DATA
df <- read.table(text = " X1 X2 X3 X4 X5 X6 X7
1 2 7 2 3 5 6 7
2 4 2 3 6 1 NA 3
3 3 6 4 4 4 7 7
4 6 5 6 NA 3 1 7
5 1 1 2 3 3 3 7
6 4 7 2 4 5 4 2
7 5 NA 4 5 2 2 3
8 3 7 2 4 4 1 5
9 4 5 6 2 5 6 3
10 2 4 6 4 5 6 3",
header = TRUE)

Here is an option using tidyverse
library(tidyverse)
# Reshape to long form (dropping NAs), tally every value, then compare the
# share of 6:7 (y/z) with the share of 1:4 (x/z).
df %>%
  pivot_longer(everything(), values_drop_na = TRUE) %>%
  count(value) %>%
  mutate(n1 = sum(n)) %>% # n1 is the total number of non-NA cells
  filter(value %in% c(1:4, 6:7)) %>%
  # FALSE (values 1:4) sorts before TRUE (values 6:7), so diff() below
  # yields y/z - x/z rather than the reverse.
  group_by(grp = value %in% 6:7) %>%
  summarise(perc = sum(n) / first(n1)) %>%
  summarise(z = diff(perc))
# A tibble: 1 x 1
# z
# <dbl>
# 1 -0.358

Another approach sticking with table(), putting your counting structure into a list.
# Tabulate once, then sum the counts whose labels fall in each value set.
# Matching on names(tab) instead of indexing by position keeps the result
# correct when one of the values 1:7 never occurs in the data.
tab <- table(unlist(d))
count <- setNames(
  lapply(list(1:4, 6:7, 1:7), function(vals) sum(tab[names(tab) %in% vals])),
  tail(letters, 3)
)
> with(count, y/z - x/z)
[1] -0.358209
Data
d <- structure(list(X1 = c(2L, 4L, 3L, 6L, 1L, 4L, 5L, 3L, 4L, 2L),
X2 = c(7L, 2L, 6L, 5L, 1L, 7L, NA, 7L, 5L, 4L), X3 = c(2L,
3L, 4L, 6L, 2L, 2L, 4L, 2L, 6L, 6L), X4 = c(3L, 6L, 4L, NA,
3L, 4L, 5L, 4L, 2L, 4L), X5 = c(5L, 1L, 4L, 3L, 3L, 5L, 2L,
4L, 5L, 5L), X6 = c(6L, NA, 7L, 1L, 3L, 4L, 2L, 1L, 6L, 6L
), X7 = c(7L, 3L, 7L, 7L, 7L, 2L, 3L, 5L, 3L, 3L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"))

Related

Remove duplicate rows in R and add entries of removed rows to kept row

I have data like this:
df <-
a b c
1 2 3
1 2 4
1 2 5
1 2 9
2 3 3
2 3 4
2 3 5
2 3 9
3 4 3
3 4 4
3 4 5
3 4 9
I want to remove duplicate rows based on column a but keep the values in column c as in:
df2 <-
a b c c1 c2 c3
1 2 3 4 5 9
2 3 3 4 5 9
3 4 3 4 5 9
I know how to remove duplicates as in :
df2 <-df[!(df$a=="1"),]
But I have no idea how to add the values to the kept row.
We can exclude c while subsetting the rows of the dataset, unlist, and then concatenate with the whole 'c' column
c(unlist(df[!duplicated(df$a), 1:2]), c = df$c)
# a b c1 c2 c3 c4
# 1 2 3 4 5 9
If we need the same names as in the expected
c(unlist(df[!duplicated(df$a), 1:2]),
setNames(df$c, make.unique(rep('c', nrow(df)), sep="")))
# a b c c1 c2 c3
# 1 2 3 4 5 9
With the new example
library(dplyr)
library(tidyr)
# Collapse each `a` group to one row: keep the first `b` and spread the
# group's `c` values into columns c1, c2, ...
df2 %>%
  group_by(a) %>%
  summarise(b = first(b), c = list(as.list(c))) %>%
  # The list elements are unnamed, so names_sep must be supplied; with ""
  # the generated names are c1, c2, ... and no rename step (or stringr)
  # is needed.
  unnest_wider(c, names_sep = "")
# A tibble: 2 x 6
# a b c1 c2 c3 c4
# <int> <int> <int> <int> <int> <int>
#1 1 2 3 4 5 9
#2 2 2 3 4 5 9
Or with again updated example
# Collapse each `a` group to one row: keep the first `b` and spread the
# group's `c` values into columns c1, c2, ...
df3 %>%
  group_by(a) %>%
  summarise(b = first(b), c = list(as.list(c))) %>%
  # The list elements are unnamed, so names_sep must be supplied; with ""
  # the generated names are c1, c2, ... and no rename step (or stringr)
  # is needed.
  unnest_wider(c, names_sep = "")
# A tibble: 3 x 6
# a b c1 c2 c3 c4
# <int> <int> <int> <int> <int> <int>
#1 1 2 3 4 5 9
#2 2 3 3 4 5 9
#3 3 4 3 4 5 9
Or with data.table
library(data.table)
setDT(df3)[, c(.(b = first(b)),
as.data.frame.list(setNames(c, rep('c', .N)))), a]
# a b c c.1 c.2 c.3
#1: 1 2 3 4 5 9
#2: 2 3 3 4 5 9
#3: 3 4 3 4 5 9
data
df <- structure(list(a = c(1L, 1L, 1L, 1L), b = c(2L, 3L, 3L, 4L),
c = c(3L, 4L, 5L, 9L)), class = "data.frame", row.names = c(NA,
-4L))
df2 <- structure(list(a = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), b = c(2L,
3L, 3L, 4L, 2L, 3L, 3L, 4L), c = c(3L, 4L, 5L, 9L, 3L, 4L, 5L,
9L)), class = "data.frame", row.names = c(NA, -8L))
df3 <- structure(list(a = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L), b = c(2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L
), c = c(3L, 4L, 5L, 9L, 3L, 4L, 5L, 9L, 3L, 4L, 5L, 9L)), class = "data.frame", row.names = c(NA,
-12L))

Create a calculated field in R at each record/row level

I have the below dataframe from which I intend to create a calculated field at each Code level or row level.
Code count_pol const_q
A028 12 3
B09 7 4
M017 5 2
S83 4 1
S1960 6 4
S179 2 2
S193 3 3
In the above dataset, I want to create a calculated field y for which the following conditions apply:
If for a code the count_pol is 1, 2 or 3, then y = count_pol/const_q; otherwise y = const_q/4
Thus the expected output is:
Code count_pol const_q y
A028 12 3 0.75
B09 7 4 1
M017 5 2 0.5
S83 4 1 0.25
S1960 6 4 1
S179 2 2 1
S193 3 3 1
I have tried the below code:
a_df <- mutate(a_df,
y = if_else(count_pol %in% c(1:3), as.integer(const_q)/count_pol,const_q/4))
but that does not give the desired output.
Can someone please help me rectify this?
We can use if_else to check for values in 1:3
library(dplyr)
df %>% mutate(y = if_else(count_pol %in% 1:3, count_pol/const_q, const_q/4))
# Code count_pol const_q y
#1 A028 12 3 0.75
#2 B09 7 4 1.00
#3 M017 5 2 0.50
#4 S83 4 1 0.25
#5 S1960 6 4 1.00
#6 S179 2 2 1.00
#7 S193 3 3 1.00
and in base R that would be
transform(df, y = ifelse(count_pol %in% 1:3, count_pol/const_q, const_q/4))
data
df <- structure(list(Code = structure(c(1L, 2L, 3L, 7L, 6L, 4L, 5L),
.Label = c("A028", "B09", "M017", "S179", "S193", "S1960", "S83"),
class = "factor"), count_pol = c(12L, 7L, 5L, 4L, 6L, 2L, 3L), const_q = c(3L,
4L, 2L, 1L, 4L, 2L, 3L)), class = "data.frame", row.names = c(NA, -7L))
With case_when() ...
# The rule is evaluated element-wise on the columns, so no grouping is needed
# (the column is also spelled `Code`, so grouping by `code` would fail).
df %>%
  mutate(
    y = case_when(
      count_pol %in% c(1, 2, 3) ~ count_pol / const_q,
      TRUE ~ const_q / 4
    )
  )

Finding difference between specific rows by group

Within a group, I want to find the difference between that row and the first time that user appeared in the data. For example, I need to create the diff variable below. Users have different number of rows each as in the following data:
df <- structure(list(ID = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 4L, 4L),
money = c(9L, 12L, 13L, 15L, 5L, 7L, 8L, 5L, 2L, 10L), occurence = c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 1L, 1L, 2L), diff = c(NA, 3L, 4L,
6L, NA, 2L, 3L, NA, NA, 8L)), .Names = c("ID", "money", "occurence",
"diff"), class = "data.frame", row.names = c(NA, -10L))
ID money occurence diff
1 1 9 1 NA
2 1 12 2 3
3 1 13 3 4
4 1 15 4 6
5 2 5 1 NA
6 2 7 2 2
7 2 8 3 3
8 3 5 1 NA
9 4 2 1 NA
10 4 10 2 8
You can use ave(). We just remove the first value per group and replace it with NA, and subtract the first value from the rest of the values.
with(df, ave(money, ID, FUN = function(x) c(NA, x[-1] - x[1])))
# [1] NA 3 4 6 NA 2 3 NA NA 8
A dplyr solution, which uses the first function to get the first value and calculate the difference.
library(dplyr)
df2 <- df %>%
  group_by(ID) %>%
  # Blank out only the first row of each ID. Testing for `diff == 0` would
  # also wipe later rows whose money happens to equal the first value.
  mutate(diff = replace(money - first(money), 1, NA)) %>%
  ungroup()
df2
# # A tibble: 10 x 4
# ID money occurence diff
# <int> <int> <int> <int>
# 1 1 9 1 NA
# 2 1 12 2 3
# 3 1 13 3 4
# 4 1 15 4 6
# 5 2 5 1 NA
# 6 2 7 2 2
# 7 2 8 3 3
# 8 3 5 1 NA
# 9 4 2 1 NA
# 10 4 10 2 8
Update
Here is a data.table solution based on a suggestion by Sotos. The difference is written to diff, so money is left untouched, and only the first row of each ID is set to NA.
library(data.table)
# Assign to `diff`, not `money`: `money := ...` would overwrite the input.
setDT(df)[, diff := replace(money - first(money), 1L, NA), by = ID][]
# ID money occurence diff
# 1: 1 9 1 NA
# 2: 1 12 2 3
# 3: 1 13 3 4
# 4: 1 15 4 6
# 5: 2 5 1 NA
# 6: 2 7 2 2
# 7: 2 8 3 3
# 8: 3 5 1 NA
# 9: 4 2 1 NA
# 10: 4 10 2 8
DATA
dput(df)
structure(list(ID = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 4L, 4L),
money = c(9L, 12L, 13L, 15L, 5L, 7L, 8L, 5L, 2L, 10L), occurence = c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 1L, 1L, 2L)), .Names = c("ID", "money",
"occurence"), row.names = c(NA, -10L), class = "data.frame")

Frequency of values per column in table

What is a good way to get the independent frequency counts of multiple columns using dplyr? I want to go from a table of values:
# A tibble: 7 x 4
a b c d
<int> <int> <int> <int>
1 1 2 1 3
2 1 2 1 3
3 2 2 5 3
4 3 2 4 3
5 3 3 2 3
6 5 3 4 3
7 5 4 2 1
to a frequency table like so:
# A tibble: 5 x 5
x a_n b_n c_n d_n
<int> <int> <int> <int> <int>
1 1 2 0 2 1
2 2 1 4 2 0
3 3 2 2 0 6
4 4 0 1 2 0
5 5 2 0 1 0
I'm still trying to get my head around dplyr, but it seems like this is something it could do. If it is easier to do with an add-on library, that is fine too.
For the same data set that you provided in the question this would be another solution (base-R):
# Sort the shared level set so the rows come out in value order (1, 2, 3, ...)
# instead of order of first appearance in the data.
lvls <- sort(unique(unlist(df)))
myfreq <- sapply(df, function(x) table(factor(x, levels = lvls, ordered = TRUE)))
Output would be:
> myfreq
# a b c d
# 1 2 0 2 1
# 2 1 4 2 0
# 3 2 2 0 6
# 4 0 1 2 0
# 5 2 0 1 0
Using tabulate in base R:
# nbins pads every column's counts out to the overall maximum, so a value
# that never occurs in a column is reported as 0 rather than NA.
lo <- min(df)
hi <- max(df)
vapply(df, function(x) tabulate(x, nbins = hi)[lo:hi], integer(hi - lo + 1L))
# a b c d
#[1,] 2 0 2 1
#[2,] 1 4 2 0
#[3,] 2 2 0 6
#[4,] 0 1 2 0
#[5,] 2 0 1 0
library(dplyr)
library(reshape2)
df %>%
melt() %>%
dcast(value ~ variable, fun.aggregate=length)
# value a b c d
# 1 1 2 0 2 1
# 2 2 1 4 2 0
# 3 3 2 2 0 6
# 4 4 0 1 2 0
# 5 5 2 0 1 0
Data
df <- structure(list(a = c(1L, 1L, 2L, 3L, 3L, 5L, 5L), b = c(2L, 2L,
2L, 2L, 3L, 3L, 4L), c = c(1L, 1L, 5L, 4L, 2L, 4L, 2L), d = c(3L,
3L, 3L, 3L, 3L, 3L, 1L)), .Names = c("a", "b", "c", "d"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7"))
library(tidyverse)
dt <- data.frame(a = c(1L, 1L, 2L, 3L, 3L, 5L, 5L), b = c(2L, 2L, 2L, 2L, 3L, 3L, 4L),
c = c(1L, 1L, 5L, 4L, 2L, 4L, 2L), d = c(3L, 3L, 3L, 3L, 3L, 3L, 1L))
# Long form -> per-column value counts -> one `<col>_n` column per input column.
dt2 <- dt %>%
  pivot_longer(everything(), names_to = "Group", values_to = "x") %>%
  mutate(Group = paste0(Group, "_n")) %>%
  count(Group, x) %>%
  # Values that never occur in a column get a count of 0.
  pivot_wider(names_from = Group, values_from = n, values_fill = list(n = 0L)) %>%
  # pivot_wider keeps rows in order of first appearance; sort by value.
  arrange(x)

R Aggregate and count of not null

I have the following data table
PIECE SAMPLE QC_CODE
1 1 1
2 1 NA
3 2 2
4 2 4
5 2 NA
6 3 6
7 3 3
8 3 NA
9 4 6
10 4 NA
and I would like to count the number of qc_code in each sample and return an output like this
SAMPLE SAMPLE_SIZE QC_CODE_COUNT
1 2 1
2 3 2
3 3 2
4 2 1
Where sample size is the count of pieces in each sample, and qc_code_count is the count of all qc_code values that are not NA.
How would I go about this in R?
You can try
library(dplyr)
df1 %>%
group_by(SAMPLE) %>%
summarise(SAMPLE_SIZE=n(), QC_CODE_UNIT= sum(!is.na(QC_CODE)))
# SAMPLE SAMPLE_SIZE QC_CODE_UNIT
#1 1 2 1
#2 2 3 2
#3 3 3 2
#4 4 2 1
Or
library(data.table)
setDT(df1)[,list(SAMPLE_SIZE=.N, QC_CODE_UNIT=sum(!is.na(QC_CODE))), by=SAMPLE]
Or using aggregate from base R
do.call(data.frame,aggregate(QC_CODE~SAMPLE, df1, na.action=NULL,
FUN=function(x) c(SAMPLE_SIZE=length(x), QC_CODE_UNIT= sum(!is.na(x)))))
data
df1 <- structure(list(PIECE = 1:10, SAMPLE = c(1L, 1L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L), QC_CODE = c(1L, NA, 2L, 4L, NA, 6L, 3L, NA,
6L, NA)), .Names = c("PIECE", "SAMPLE", "QC_CODE"), class = "data.frame",
row.names = c(NA, -10L))

Resources