Count missing values per class

Count missing values per class - r

I am looking to check the pattern of missing values according to a class label (dependent variable) in my data. The output I want is the class labels and the number of missing values in the class.
library(tidyverse)
fakeData <- data.frame(var1 = c(1,2,NA,4,NA,6,7,8,9,10),
var2=c(11,NA,NA,14,NA,16,17,NA,19,NA),
Class = c(rep("A", 5), rep("B", 5)))
fakeData %>% group_by(Class) %>% summarize(numMissing = sum(is.na()))
Error in summarise_impl(.data, dots) :
Evaluation error: 0 arguments passed to 'is.na' which requires 1.
What is wrong with my approach here?

I think this is a cleaner solution, using tidyverse only. You don't need to know the number of columns. You can also use ?select_helpers in gather() to select columns, eg. starts_with("var").
fakeData %>%
group_by(Class) %>%
gather(variable, value, -Class) %>% # all except Class
summarise(missing_n = sum(is.na(value)))
# A tibble: 2 x 2
Class missing_n
<fctr> <int>
1 A 5
2 B 2

Perhaps, we can do
# Count the NAs in every non-grouping column per Class, then add the two
# per-column counts together. across() replaces the deprecated
# summarise_all(funs(...)) idiom and keeps the column names var1 / var2.
fakeData %>%
  group_by(Class) %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  transmute(Class, numMissing = var1 + var2)
If we have many columns, then use purrr::reduce
# Same per-Class NA count without naming the value columns: count NAs in
# every non-grouping column, then add up all columns except the first
# (Class). across() replaces the deprecated summarise_all(funs(...)) idiom.
fakeData %>%
  group_by(Class) %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  transmute(Class, numMissing = .[-1] %>% reduce(`+`))
#or with rowSums
#transmute(Class, numMissing = rowSums(.[-1]))

I would suggest melting dataset in long format using reshape lib. Then just use aggregate function by Class variable.
library(reshape)
fakeData <- data.frame(var1 = c(1,2,NA,4,NA,6,7,8,9,10),
var2=c(11,NA,NA,14,NA,16,17,NA,19,NA),
Class = c(rep("A", 5), rep("B", 5)))
fData <- melt(fakeData, measure.vars = c("var1", "var2"))
fData
Class variable value
1 A var1 1
2 A var1 2
3 A var1 NA
4 A var1 4
5 A var1 NA
6 B var1 6
7 B var1 7
8 B var1 8
9 B var1 9
10 B var1 10
11 A var2 11
12 A var2 NA
13 A var2 NA
14 A var2 14
15 A var2 NA
16 B var2 16
17 B var2 17
18 B var2 NA
19 B var2 19
20 B var2 NA
with(fData, aggregate(value, list(Class), function(x) { sum(is.na(x)) }))
Group.1 x
1 A 5
2 B 2

Related

How to do a for loop with case_when

I'm a beginner with R and I'm trying to do a for-loop to recode many variables: when "test" modality is missing, then have "test.v1" modality. It looked very easy to do, but I can't get it:
VEC_1 <- c("test1","test2","test3","test4","test5","test6","test7","test8","test9")
VEC_2 <- c("test1.v1","test2.v1","test3.v1","test4.v1","test5.v1","test6.v1","test7.v1","test8.v1","test9.v1")
for (i in 1:(min(length(VEC_1), length(VEC_2)))){
df2 <- df1 %>%
mutate(
VEC_1[i] = case_when(
is.na(VEC_1[i]) & !is.na(VEC_2[i]) ~ VEC_2[i],
TRUE ~ VEC_1[i])
)
}
I have this error
Unexpected error : '=' in:
" mutate(
VEC_1[i] ="
Does anyone have an idea ?
EDIT: df1 is like :
test1 <- c("A","B","A","A",NA,"B","A",NA,"A")
test1.v1 <- c("B",NA,"B","B","A","B","B",NA,"A")
test2 <- c("B","B","B","B",NA,"C","C","C","C")
test2.v1 <- c("C",NA,"A","A","B","B","C",NA,"C")
test3 <- c("A","B","B","B",NA,"C","C",NA,"C")
test3.v1 <- c("B","A","B",NA,"A","A","A","A",NA)
test4 <- c(NA,"B","B","A",NA,"B","A",NA,"A")
test4.v1 <- c("B","B","B","A","A","B","B","B","B")
df1 <- data.frame(test1,test1.v1,test2,test2.v1,test3,test3.v1,test4,test4.v1)

Based on the example data.frame df1, I'm wondering if you might try putting your data into long form, then grouping by row number and test, then substituting missing values.
library(tidyverse)
df1 %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, names_to = c("test", "mode"), names_pattern = "test(\\d+)([.v1]*)") %>%
group_by(rn, test) %>%
mutate(value = ifelse(mode == "" & is.na(value), value[mode == ".v1"], value)) %>%
pivot_wider(id_cols = rn, names_from = c(test, mode), values_from = value, names_prefix = "test", names_sep = "")
Output
rn test1 test1.v1 test2 test2.v1 test3 test3.v1 test4 test4.v1
<int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 A B B C A B B B
2 2 B NA B NA B A B B
3 3 A B B A B B B B
4 4 A B B A B NA A A
5 5 A A B B A A A A
6 6 B B C B C A B B
7 7 A B C C C A A B
8 8 NA NA C NA A A B B
9 9 A A C C C NA A B

I'm assuming VEC_1 and VEC_2 are the same length here. You don't need to use a for-loop: mutate operates on the whole column of the data frame at once, so it effectively behaves like a loop over the rows. I have rewritten your code like this and changed the data for testing purposes:
library(dplyr)
VEC_1 <- c("test1","test2","test3","test4",NA,"test6","test7",NA,"test9")
VEC_2 <- c("test1.v1",NA,"test3.v1","test4.v1","test5.v1","test6.v1","test7.v1",NA,"test9.v1")
df <- data.frame(VEC_1,VEC_2)
df %>% mutate(VEC_1 = if_else(is.na(VEC_1) & !is.na(VEC_2),VEC_2,VEC_1))
which is equal to
df <- data.frame(VEC_1,VEC_2)
# Row-by-row version of the same replacement: copy VEC_2 into VEC_1 wherever
# VEC_1 is missing and VEC_2 is not.
# seq_len() yields an empty sequence for a zero-row data frame, whereas
# 1:nrow(df) would iterate over c(1, 0).
for (i in seq_len(nrow(df))) {
  # Both operands are length-one here, so the scalar `&&` is the right
  # operator for `if`; `&` is meant for element-wise vector masks.
  if (is.na(df$VEC_1[i]) && !is.na(df$VEC_2[i])) {
    df$VEC_1[i] <- df$VEC_2[i]
  }
}
Output:
> df
VEC_1 VEC_2
1 test1 test1.v1
2 test2 <NA>
3 test3 test3.v1
4 test4 test4.v1
5 test5.v1 test5.v1
6 test6 test6.v1
7 test7 test7.v1
8 <NA> <NA>
9 test9 test9.v1
I also changed from case_when to if_else, because you only check one condition.
?if_else
if_else(condition, true, false, missing = NULL)

ifelse/if_else will be a good solution, unless you have multiple conditions, then case_when is your friend. A loop isn't necessary:
library(dplyr)
VEC_1 <- c("test1","test2",NA,NA)
VEC_2 <- c("test1.v1",NA,NA,"test4.v1")
df <- tibble(VEC_1, VEC_2)
df %>%
mutate(VEC_1 = case_when(
is.na(VEC_2) ~ VEC_1,
is.na(VEC_1) ~ VEC_2,
TRUE ~ VEC_1)
)
# A tibble: 4 × 2
VEC_1 VEC_2
<chr> <chr>
1 test1 test1.v1
2 test2 NA
3 NA NA
4 NA test4.v1
# A tibble: 4 × 2
VEC_1 VEC_2
<chr> <chr>
1 test1 test1.v1
2 test2 NA
3 NA NA
4 test4.v1 test4.v1

Reshape from unconstructed dataset in r

I am trying to reshape a dataset by switching some cells information. Here is how my sample dataset looks like.
data <- data.frame(var1 = c("Text","A","B","C","D"),
var2 = c("Text",NA, 1,0,1),
var3 = c("112-1",NA,NA,"text",NA),
var4 = c("Text",1,0,NA, NA),
var5 = c("113-1",NA,"text",NA,NA))
> data
var1 var2 var3 var4 var5
1 Text Text 112-1 Text 113-1
2 A <NA> <NA> 1 <NA>
3 B 1 <NA> 0 text
4 C 0 text <NA> <NA>
5 D 1 <NA> <NA> <NA>
It needs some cleaning first. var1 has the item information. var2 and var4 have score information. var3 and var5 have id information in the first row.
I will need to reshape this dataset as below.
> data.1
id A B C D
1 112 NA 1 0 1
2 113 1 0 NA NA
Considering that this data file has multiple columns (e.g. more columns var6, var7, var8, var9, etc.) with the same pattern, how can I reshape it to this desired dataset?

This isn't much different from my answer yesterday, but this will give you the result you asked for. Shift that first row over one column so that the id is on the same column with the needed values, remove the unnecessary columns, then make row one the column names. Add some pivots and then it should be roughly what you need:
data <- data.frame(var1 = c("Text","A","B","C","D"), var2 = c("Text",NA, 1,0,1), var3 = c("112",NA,NA,NA,NA), var4 = c("Text",1,0,NA, NA), var5 = c(113,NA,NA,NA,NA))
library(dplyr)
library(tidyr)
data2<-data%>%
mutate_all(as.character) #Making character to avoid factor issues
data2[1, 2:(ncol(data2) - 1)] <- data2[1, 3:ncol(data2)] #Shifting first row over one column
data3<-data2%>%
select(-var3,-var5) #Removing the uneeded columns
colnames(data3) <- data3[1,] #Taking the first row and making it the column names
data3 <- data3[-1, ] #removing row 1, since it was made into column names
data3%>%
tidyr::pivot_longer(-Text, names_to = "id", values_to = "time")%>% #Making the data into longer format
tidyr::pivot_wider(names_from = Text, values_from = time) #Then back into wide

You could shift the first row, delete the odd-numbered columns after the first (using %% 2), and transpose.
data[1, ] <- data[1, -1]
data <- data[c(TRUE, seq_len(ncol(data))[-1] %% 2 == 0)]
setNames(as.data.frame(t(data[, -1]), row.names=FALSE), c('id', data[[1]][-1])) |>
type.convert(as.is=TRUE)
# id A B C D
# 1 112-1 NA 1 0 1
# 2 113-1 1 0 NA NA
BTW, how do you get such data? Maybe you have an x-y-problem.

library(dplyr)
library(tidyr)
library(stringr)
#First rename the columns to more appropriate
n = 2 #Number of pairs of columns you have (here 2)
nam <- do.call(paste0, (expand.grid(c("n_", "id_"), seq(n))))
colnames(data) <- c("col", nam)
#Then, the data manipulation
data %>%
mutate(across(starts_with("id"), ~ first(str_remove(.x, "-")))) %>%
fill(starts_with("id")) %>%
slice(-1) %>%
pivot_longer(-col, names_to = c(".value", "rn"), names_sep = "_") %>%
pivot_wider(names_from = "col", values_from = 'n') %>%
select(-rn)
id A B C D
1 1121 NA 1 0 1
2 1131 1 0 NA NA

Group data by factor level, then transform to data frame with colname being levels?

Here is a problem that I can't solve:
Data:
df <- data.frame(f1=c("a", "a", "b", "b", "c", "c", "c"),
v1=c(10, 11, 4, 5, 0, 1, 2))
data.frame:f1 is factor
f1 v1
a 10
a 11
b 4
b 5
c 0
c 1
c 2
# What I want is:(for example, fetch data with the number of element of some level == 2, then to data.frame)
a b
10 4
11 5
Thanks in advance!

I might be missing something simple here, but the approach below using dplyr works.
library(dplyr)
# Keep only the f1 levels that occur exactly `nlevels` times, then turn the
# levels into columns. The running row number gives every row its own key,
# so each value lands on a separate row and the other column is NA there.
nlevels <- 2
df1 <- df %>%
  add_count(f1) %>%
  filter(n == nlevels) %>%
  select(-n) %>%
  mutate(rn = row_number()) %>%
  # spread() is a tidyr function; only dplyr is attached in this snippet,
  # so it has to be namespace-qualified.
  tidyr::spread(f1, v1) %>%
  select(-rn)
This gives
# a b
# <int> <int>
#1 10 NA
#2 11 NA
#3 NA 4
#4 NA 5
Now, if you want to remove NA's we can do
do.call("cbind.data.frame", lapply(df1, function(x) x[!is.na(x)]))
# a b
#1 10 4
#2 11 5
As we have filtered the dataframe which has only nlevels observations, we would have same number of rows for each column in the final dataframe.

split might be useful here to split df$v1 into parts corresponding to df$f1. Since you are always extracting equal length chunks, it can then simply be combined back to a data.frame:
spl <- split(df$v1, df$f1)
data.frame(spl[lengths(spl)==2])
# a b
#1 10 4
#2 11 5
Or do it all in one call by combining this with Filter:
data.frame(Filter(function(x) length(x)==2, split(df$v1, df$f1)))
# a b
#1 10 4
#2 11 5

Here is a solution using unstack :
unstack(
droplevels(df[ave(df$v1, df$f1, FUN = function(x) length(x) == 2)==1,]),
v1 ~ f1)
# a b
# 1 10 4
# 2 11 5
A variant, similar to #thelatemail's solution :
data.frame(Filter(function(x) length(x) == 2, unstack(df,v1 ~ f1)))
My tidyverse solution would be:
library(tidyverse)
df %>%
group_by(f1) %>%
filter(n() == 2) %>%
mutate(i = row_number()) %>%
spread(f1, v1) %>%
select(-i)
# # A tibble: 2 x 2
# a b
# * <dbl> <dbl>
# 1 10 4
# 2 11 5
or mixing approaches :
as_tibble(keep(unstack(df,v1 ~ f1), ~length(.x) == 2))

Using all base functions (but you should use tidyverse)
# Add count of instances
x$len <- ave(x$v1, x$f1, FUN = length)
# Filter, drop the count
x <- x[x$len==2, c('f1','v1')]
# Hacky pivot
result <- data.frame(
lapply(unique(x$f1), FUN = function(y) x$v1[x$f1==y])
)
colnames(result) <- unique(x$f1)
> result
a b
1 10 4
2 11 5

I would code it like this; maybe it helps you.
library(reshape2)
library(dplyr)
# Sample data: v1 holds the level, f1 the value.
aa <- data.frame(v1 = c("a", "a", "b", "b", "c", "c", "c"),
                 f1 = c(10, 11, 4, 5, 0, 1, 2))
# Number of rows per level of v1, stored in column `id`.
cc <- aa %>% group_by(v1) %>% summarise(id = n())
# Attach each level's row count to its rows (merged on the shared column v1).
dd <- merge(aa, cc)
# Keep the levels that have exactly two rows. The count column is `id`;
# `dd$aa` does not exist (it is NULL), so filtering on it selected no rows.
ee <- dd[dd$id == 2, ]
# Reuse `id` as the position within each level (1, 2, 1, 2, ...), computed
# per level so it does not depend on the row order or on a fixed pattern.
ee$id <- ave(seq_along(ee$v1), ee$v1, FUN = seq_along)
# One column per remaining level, one row per within-level position.
dcast(ee, id ~ v1, value.var = "f1")
all done!

How to remove NA in summarize_all to summarize multiple columns at the same time?

I have a dataframe like this and want to summarize the mean of every col ignoring NA using dplyr:
df= data.frame('var1'=sample(10,3),'var2'=sample(10,3), 'var3'=c(NA, NA,1), 'var4'=c(2,NA,6))
df %>% summarise_all(mean)
however, this will return NA in col 3 and 4.
How can I pass in na.rm=T?

> df %>% summarise_each(funs(mean(., na.rm = TRUE)))
var1 var2 var3 var4
1 5 5 1 4

You can simply use mean_ from hablar that has na.rm = T as default:
library(hablar)
df %>% summarise_all(mean_)
var1 var2 var3 var4
1 6.666667 4.666667 1 4

you can use IDPmisc library
example
M <- matrix(c(NA,1:7,NA),nrow=3)
M
NaRV.omit(M)

Grouping Over All Possible Combinations of Several Variables With dplyr

Given a situation such as the following
library(dplyr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
I would like to group `myData` to eventually find summary data grouping by all possible combinations of var2, var3, and var4.
I can create a list with all possible combinations of variables as character values with
groupNames <- names(myData)[2:4]
myGroups <- Map(combn,
list(groupNames),
seq_along(groupNames),
simplify = FALSE) %>%
unlist(recursive = FALSE)
My plan was to make separate data sets for each variable combination with a for() loop, something like
### This Does Not Work
for (i in 1:length(myGroups)){
assign( myGroups[i]%>%
unlist() %>%
paste0(collapse = "")%>%
paste0("Data"),
myData %>%
group_by_(lapply(myGroups[[i]], as.symbol)) %>%
summarise( n = length(var1),
avgVar2 = var2 %>%
mean()))
}
Admittedly I am not very good with lists, and looking up this issue was a bit challenging since dplyr updates have altered how grouping works a bit.
If there is a better way to do this than separate data sets I would love to know.
I've gotten a loop similar to above working when I am only grouping by a single variable.
Any and all help is greatly appreciated! Thank you!

This seems convoluted, and there's probably a way to simplify or fancy it up with a do, but it works. Using your myData and myGroups,
# For every combination of grouping columns in myGroups, group myData by
# those columns and report the group size and the mean of var1.
results <- lapply(myGroups, function(group_cols) {
  # group_by_() takes the column names as separate string arguments, so the
  # call is assembled with do.call(): data first, then one name per argument.
  grouped <- do.call(group_by_, c(list(myData), group_cols))
  summarise(grouped,
            n = length(var1),
            avgVar1 = mean(var1))
})
> results[[1]]
Source: local data frame [3 x 3]
var2 n avgVar1
1 a 31 0.38929738
2 b 31 -0.07451717
3 c 38 -0.22522129
> results[[4]]
Source: local data frame [9 x 4]
Groups: var2
var2 var3 n avgVar1
1 a A 11 -0.1159160
2 a B 11 0.5663312
3 a C 9 0.7904056
4 b A 7 0.0856384
5 b B 13 0.1309756
6 b C 11 -0.4192895
7 c A 15 -0.2783099
8 c B 10 -0.1110877
9 c C 13 -0.2517602
> results[[7]]
# I won't paste them here, but it has all 27 rows, grouped by var2, var3 and var4.
I changed your summarise call to average var1 since var2 isn't numeric.

I have created a function based on the answer of #Gregor and the comments that followed:
library(magrittr)
myData <- tbl_df(data.frame( var1 = rnorm(100),
var2 = letters[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var3 = LETTERS[1:3] %>%
sample(100, replace = TRUE) %>%
factor(),
var4 = month.abb[1:3] %>%
sample(100, replace = TRUE) %>%
factor()))
Function combSummarise
# Summarise `data` once for every non-empty combination of grouping columns.
#
# data       Data frame (or tibble) to summarise.
# variables  Character vector of grouping column names.
# summarise  Character vector of summary expressions, e.g. "mean(var1)".
#
# Returns a single data frame: the per-combination summaries stacked with
# plyr::rbind.fill(), grouping columns first, summary columns after them.
#
# NOTE(review): the `...` defaults look like placeholders rather than usable
# values; always pass both `variables` and `summarise`.
combSummarise <- function(data, variables=..., summarise=...){
  # Every combination of 1, 2, ..., length(variables) grouping columns
  # (credit to #Michael)
  myGroups <- unlist(
    lapply(seq_along(variables), function(k) {
      combn(c(variables), k, simplify = FALSE)
    }),
    recursive = FALSE
  )

  # Group and summarise once per combination (credit to #konvas). The
  # expressions are passed through `.dots` instead of being pasted into
  # source text and run through eval(parse()).
  summaries <- lapply(myGroups, function(x) {
    dplyr::summarize_(dplyr::group_by_(data, .dots = x), .dots = summarise)
  })
  df <- do.call(plyr::rbind.fill, summaries)

  # The last combination contains every grouping column; put those first.
  groupNames <- c(myGroups[[length(myGroups)]])
  newNames <- names(df)[!(names(df) %in% groupNames)]
  # List-style indexing keeps a data frame even with a single grouping column
  # and a single summary column, where df[, name] would drop to a vector and
  # cbind() would return a matrix.
  df[c(groupNames, newNames)]
}
Call of combSummarise
combSummarise (myData, var=c("var2", "var3", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)", "mean(var1)", "max(var1)"))
or
combSummarise (myData, var=c("var2", "var4"),
summarise=c("length(var1)"))
etc

Inspired by the answers by Gregor and dimitris_ps, I wrote a dplyr style function that runs summarise for all combinations of group variables.
# Run summarise(...) for every subset of the grouping variables of `data`,
# including the empty subset, and stack the results.
#
# data  A grouped data frame; its grouping variables define the subsets.
# ...   Summary expressions, passed on to summarise() unchanged.
#
# Returns the row-bound summaries with the grouping columns placed first.
summarise_combo <- function(data, ...) {
  # Grouping variables as symbols, so they can be spliced with !!!
  group_syms <- map(group_vars(data), as.name)

  # All subsets of size 0, 1, ..., length(group_syms), flattened into a
  # single list of symbol lists
  subsets_by_size <- map(
    0:length(group_syms),
    function(k) combn(group_syms, k, simplify = FALSE)
  )
  combos <- unlist(subsets_by_size, recursive = FALSE)

  # Regroup by one subset and summarise; `...` is the caller's expressions
  summarise_one <- function(combo) {
    summarise(group_by(data, !!!combo), ...)
  }
  stacked <- bind_rows(map(combos, summarise_one))

  select(stacked, !!!group_syms, everything())
}
Example
library(tidyverse)
mtcars %>% group_by(cyl, vs) %>% summarise_combo(cyl_n = n(), mean(mpg))

Using unite to create a new column is the simplest way
library(tidyverse)
df = tibble(
a = c(1,1,2,2,1,1,2,2),
b = c(3,4,3,4,3,4,3,4),
val = c(1,2,3,4,5,6,7,8)
)
print(df)#output1
df_2 = unite(df, 'combined_header', a, b, sep='_', remove=FALSE) #remove=F doesn't remove existing columns
print(df_2)#output2
df_2 %>% group_by(combined_header) %>%
summarize(avg_val=mean(val)) %>% print()#output3
#avg 1_3 = mean(1,5)=3 avg 1_4 = mean(2, 6) = 4
RESULTS
Output:
output1
a b val
<dbl> <dbl> <dbl>
1 1 3 1
2 1 4 2
3 2 3 3
4 2 4 4
5 1 3 5
6 1 4 6
7 2 3 7
8 2 4 8
output2
combined_header a b val
<chr> <dbl> <dbl> <dbl>
1 1_3 1 3 1
2 1_4 1 4 2
3 2_3 2 3 3
4 2_4 2 4 4
5 1_3 1 3 5
6 1_4 1 4 6
7 2_3 2 3 7
8 2_4 2 4 8
output3
combined_header avg_val
<chr> <dbl>
1 1_3 3
2 1_4 4
3 2_3 5
4 2_4 6

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Count missing values per class - r

Related

How to do a for loop with case_when

Reshape from unconstructed dataset in r

Group data by factor level, then transform to data frame with colname being levels?

How to remove NA in summarize_all to summarize multiple columns at the same time?

Grouping Over All Possible Combinations of Several Variables With dplyr

Categories

Resources