Count and merge NA rows in R - r

I have the following dataframe:
var1 <- c("a", "b", "c", "d", "e")
var2 <- c(5, 10, NA, NA, NA)
df <- data.frame (var1, var2)
df
# A tibble: 5 × 2
var1 var2
<chr> <dbl>
1 a 5
2 b 10
3 c NA
4 d NA
5 e NA
I would like to count and merge the NA rows. Expected output:
# A tibble: 3 × 2
var1 var2
<chr> <dbl>
1 a 5
2 b 10
3 x 3
I have tried aggregate(data=df,var2~.,na.rm = FALSE, FUN = sum) but it only returns the results for a and b.
Thank you in advance

With dplyr
df %>%
mutate(var1 = ifelse(is.na(var2), "x", var1),
var2 = ifelse(var1=="x", sum(is.na(var2) & var1 == "x"),
var2)
) %>%
unique()
var1 var2
1 a 1
2 b 2
3 c 3
4 x 2
Data
df <- structure(list(var1 = c("a", "b", "c", "d", "e"), var2 = c(1,
2, 3, NA, NA)), class = "data.frame", row.names = c(NA, -5L))

Try this with Base R
s <- df[complete.cases(df) , ]
s[nrow(s)+1 ,] <- c("x" , sum(is.na(df$var2) == T))
s
output
var1 var2
1 a 5
2 b 10
3 x 3

Using aggregate:
df$var1 <- ifelse(is.na(df$var2), "x", df$var1)
aggregate(data = df, var2 ~ .,
FUN = \(x) if (!all(is.na(x))) sum(x) else length(x), na.action = NULL)
#> var1 var2
#> 1 a 5
#> 2 b 10
#> 3 x 3

Another approach with dplyr
df %>%
mutate(var1 = ifelse(is.na(var2), "x", var1),
var2 = ifelse(is.na(var2), 1, var2)) %>%
count(var1, wt=var2, name="var2")

Related

I´m looking for a way to inverse the values of 2 columns of a row when this inverse exists in the dataframe?

Here is my dataframe:
DF <- data.frame(
VAR1 = c("A", "A", "B", "B", "B", "C", "C"),
VAR2 = c("B", "C", "A", "D", "C", "B", "D"),
VAR3 = c(1, 1, 1, 2, 4, 6, 4)
)
I would like to have this:
VAR1 VAR2 VAR3
A B 2
A C 1
B D 2
B C 10
C D 4
If There is two rows like (VAR1=A, VAR2=B, VAR3=X) and (VAR2=B, VAR1=A, VAR3=Y), I want to have one row like this one (VAR1=A, VAR2=B, VAR3=X+Y). So if the two first variables are "inverse", I would like to have one row with the sum of them.
I tried to have a column which says "Yes" if two rows have inverse values but I can´t find a way to do it.
My code:
DF <- DF %>%
mutate(VAR4 = case_when(VAR2 %in% DF$VAR1 &
VAR1 %in%
(DF %>%
filter(VAR1 == VAR2) %>%
pull(VAR2)
) ~ "Yes",
TRUE ~ 'No' ))
`
This is the result:
VAR1 VAR2 VAR3 VAR4
A B 1 No
A C 1 No
B A 1 No
B D 2 No
B C 4 No
C B 6 No
C D 4 No
My code doesn´t work because my filter doesn´t take the result of VAR2 %in% DF$VAR1 in account.
Does someone have an idea?
You can sort first with apply, and then summarise:
DF[1:2] <- t(apply(DF[1:2], 1, sort))
DF %>%
group_by(VAR1, VAR2) %>%
summarise(VAR3 = sum(VAR3))
# A tibble: 5 × 3
# Groups: VAR1 [3]
VAR1 VAR2 VAR3
<chr> <chr> <dbl>
1 A B 2
2 A C 1
3 B C 10
4 B D 2
5 C D 4
Or, in single pipe:
DF %>%
mutate(VAR = pmap(., ~ sort(c(..1, ..2)) %>%
set_names(., c("VAR1", "VAR2")))) %>%
group_by(VAR) %>%
summarise(VAR3 = sum(VAR3)) %>%
unnest_wider(VAR)
You could try:
library(dplyr)
DF %>%
mutate(across(VAR1:VAR2, as.character)) %>%
group_by(idx1 = pmin(VAR1, VAR2), idx2 = pmax(VAR1, VAR2)) %>%
summarise(VAR3 = sum(VAR3)) %>%
rename_with(~ sub('idx', 'VAR', .)) %>%
ungroup
Output:
# A tibble: 5 x 3
VAR1 VAR2 VAR3
<chr> <chr> <dbl>
1 A B 2
2 A C 1
3 B C 10
4 B D 2
5 C D 4

How to count rows with NA values across a selection of columns and include 0 count?

I am trying to count the number of species per region which have missing data (NA) for a selection of variables.
Here is an example of my dataframe:
library(tidyverse)
df <- structure(
list(
ID = c("AL01", "AL01", "AL02", "AL02", "AL03", "AL03"),
Species = c("Sp1",
"Sp2",
"Sp3",
"Sp4",
"Sp5",
"Sp6"),
Var1 = c("A", NA, NA, NA, "B", "B"),
Var2 = c(NA,
"A",
"B",
"C",
"B",
"C"),
Var3 = c(NA,
2.71, 2.86, 3.21, 2.87, 3.05),
Var4 = c("S", NA,
"C", NA, "S",
"C")
),
class = "data.frame",
row.names = c(NA,
6L)
)
I can get the count of species with NA for any of Var2, Var3 of Var4 by running:
df %>%
filter_at(
vars(
Var2,
Var3,
Var4
),
any_vars(is.na(.))
) %>%
group_by(ID) %>%
count()
# A tibble: 2 × 2
# Groups: ID [2]
ID n
<chr> <int>
1 AL01 2
2 AL02 1
However this only shows me AL01 and AL02 and I would also like to include AL03 for which the count is 0. I have tried this code which I thought should work:
df %>%
group_by(ID) %>%
summarise_at(vars(
Var2,
Var3,
Var4
), ~ sum(any_vars(is.na(.))))
But I get this error:
Error in `summarise()`:
! Problem while computing `Var2 = (structure(function (..., .x = ..1, .y = ..2, . =
..1) ...`.
ℹ The error occurred in group 1: ID = "AL01".
Caused by error in `abort_quosure_op()`:
! Summary operations are not defined for quosures. Do you need to unquote the
quosure?
# Bad: sum(myquosure)
# Good: sum(!!myquosure)
Run `rlang::last_error()` to see where the error occurred.
I realise I am not sure exactly how any_vars works and am unclear on how to continue. The output I would like would be:
# A tibble: 2 × 2
# Groups: ID [2]
ID n
<chr> <int>
1 AL01 2
2 AL02 1
3 AL03 0
You can do:
library(tidyverse)
df %>%
mutate(missing = apply(across(num_range('Var', 2:4)), 1, function(x) any(is.na(x)))) %>%
group_by(ID) %>%
summarize(n = sum(missing))
# A tibble: 3 x 2
ID n
<chr> <int>
1 AL01 2
2 AL02 1
3 AL03 0
df %>%
rowwise() %>%
mutate(across(num_range('Var', 2:4), is.na),
x = any(c_across(num_range('Var', 2:4)))) %>%
group_by(ID) %>%
summarise(n = sum(x))
# A tibble: 3 × 2
ID n
<chr> <int>
1 AL01 2
2 AL02 1
3 AL03 0

Converting a matrix into a tibble in R

How can I convert this matrix:
> matrix(1:3, nrow = 3, dimnames = list(c("X","Y","Z"), c("A")))
A
X 1
Y 2
Z 3
into this tibble:
> tibble::tribble(~group1, ~group2, ~value, "X", "A", 1, "Y", "A", 2, "Z", "A", 3)
# A tibble: 3 × 3
group1 group2 value
<chr> <chr> <dbl>
1 X A 1
2 Y A 2
3 Z A 3
Thank you
as.tibble can convert the matrix's rownames to a column, and then you can use gather() to create the group2 column:
library(tidyverse)
m <- matrix(1:3, nrow = 3, dimnames = list(c("X","Y","Z"), c("A")))
newtib <- m %>%
as.tibble(rownames = "group1") %>%
gather('A', key = "group2", value = "value")
> newtib
# A tibble: 3 × 3
group1 group2 value
<chr> <chr> <int>
1 X A 1
2 Y A 2
3 Z A 3
> tibble::tribble(~group1, ~group2, ~value, "X", "A", 1, "Y", "A", 2, "Z", "A", 3)
# A tibble: 3 × 3
group1 group2 value
<chr> <chr> <dbl>
1 X A 1
2 Y A 2
3 Z A 3
Easier with base R, if we convert to table and coerce with as.data.frame (if we need to convert to tibble - use as_tibble as wrapper over the as.data.frame
as.data.frame(as.table(m1))
Var1 Var2 Freq
1 X A 1
2 Y A 2
3 Z A 3
data
m1 <- matrix(1:3, nrow = 3, dimnames = list(c("X","Y","Z"), c("A")))
Transform your matrix into a dataframe
bring your rownames to column group1
mutate group2
data.frame(matrix) %>%
rownames_to_column("group1") %>%
mutate(group2 = colnames(matrix)) %>%
dplyr::select(group1, group2, value=A)
group1 group2 value
1 X A 1
2 Y A 2
3 Z A 3
You can use -
library(tidyverse)
mat <- matrix(1:3, nrow = 3, dimnames = list(c("X","Y","Z"), c("A")))
mat %>%
as.data.frame() %>%
rownames_to_column(var = 'group1') %>%
pivot_longer(cols = -group1, names_to = 'group2')
# group1 group2 value
# <chr> <chr> <dbl>
#1 X A 1
#2 Y A 2
#3 Z A 3

Assign a value to a column in R based on a percentage within each group

[]
1I need to create column C in a data frame where 30% of the rows within each group (column B) get a value 0.
How do I do this in R?
We may use rbinom after grouping by 'category' column. Specify the prob as a vector of values
library(dplyr)
df1 %>%
group_by(category) %>%
mutate(value = rbinom(n(), 1, c(0.7, 0.3))) %>%
ungroup
-output
# A tibble: 9 x 3
sno category value
<int> <chr> <int>
1 1 A 1
2 2 A 0
3 3 A 1
4 4 B 1
5 5 B 0
6 6 B 1
7 7 C 1
8 8 C 0
9 9 C 0
data
df1 <- structure(list(sno = 1:9, category = c("A", "A", "A", "B", "B",
"B", "C", "C", "C")), class = "data.frame", row.names = c(NA,
-9L))
If your data already exist (assuming this is a simplified answer), and if you want the value to be randomly assigned to each group:
library(dplyr)
d <- data.frame(sno = 1:9,
category = rep(c("A", "B", "C"), each = 3))
d %>%
group_by(category) %>%
mutate(value = sample(c(rep(1, floor(n()*.7)), rep(0, n() - floor(n()*.7)))))
Base R
set.seed(42)
d$value <- ave(
rep(0, nrow(d)), d$category,
FUN = function(z) sample(0:1, size = length(z), prob = c(0.3, 0.7), replace = TRUE)
)
d
# sno category value
# 1 1 A 0
# 2 2 A 0
# 3 3 A 1
# 4 4 B 0
# 5 5 B 1
# 6 6 B 1
# 7 7 C 0
# 8 8 C 1
# 9 9 C 1
Data copied from Brigadeiro's answer:
d <- structure(list(sno = 1:9, category = c("A", "A", "A", "B", "B", "B", "C", "C", "C")), class = "data.frame", row.names = c(NA, -9L))

Merge data frames and divide rows by group

I would like to divide the values from df1 over the values from df2. In this reproducible example, I am able to sum these values. What about the division? Thanks in advance!
df1 <- data.frame(country = c("a", "b", "c"), year1 = c(1, 2, 3), year2 = c(1, 2, 3))
df2 <- data.frame(country = c("a", "b", "d"), year1 = c(1, 2, NA), year2 = c(1, 2, 3))
df3 <- bind_rows(df1, df2) %>%
mutate_if(is.numeric, tidyr::replace_na, 0) %>%
group_by(country) %>%
summarise_all(., sum, na.rm = TRUE) %>%
na_if(., 0)
Expected result is:
# A tibble: 4 x 3
country year1 year2
<chr> <dbl> <dbl>
1 a 1 1
2 b 1 1
3 c NA NA
4 d NA NA
As there are groups with 2 rows and some with 1, use an if/else condition within summarise/across to divide the first element by the last if there are two elements or else return NA
library(dplyr) # version 1.0.4
library(tidyr)
bind_rows(df1, df2) %>%
mutate(across(where(is.numeric), replace_na, 0)) %>%
group_by(country) %>%
summarise(across(everything(), ~ if(n() == 2) first(.)/last(.)
else NA_real_))
-output
# A tibble: 4 x 3
# country year1 year2
#* <chr> <dbl> <dbl>
#1 a 1 1
#2 b 1 1
#3 c NA NA
#4 d NA NA
Here is a base R option using merge + split.default
df <- merge(df1, df2, by = "country", all = TRUE)
cbind(
df[1],
list2DF(lapply(
split.default(df[-1], gsub("\\.(x|y)", "", names(df)[-1])),
function(v) do.call("/", v)
))
)
which gives
country year1 year2
1 a 1 1
2 b 1 1
3 c NA NA
4 d NA NA

Resources