R: Get column names where value is not null - r

I have a table of 7 columns, the first column is id, then 3 columns of vegetable types and the last 3 columns are fruit types. The values indicate whether a person has this vegetable/ fruit. Is there a way to group the vegetables and the fruits, and output the column names if the person has that vegetable/ fruit?
Input data frame:
id1 <- c("id_1", 1, NA, NA, NA, 1, NA)
id2 <- c("id_2", NA, 1, 1, NA, NA, NA)
input <- data.frame(rbind(id1, id2))
colnames(input) = c("id", "lettuce", "tomato", "bellpeper", "pineapple", "apple", "banana")
Expected output data frame:
output_id1 <- c("id_1", "lettuce", "apple")
output_id2 <- c("id_2", "tomato, bellpeper", NA)
output <- data.frame(rbind(output_id1, output_id2))
colnames(output) <- c("id", "veg", "fruit")

Using the original input data you posted (also shown below in Data) you could do this with the tidyr package:
library(tidyr)
input %>%
tidyr::pivot_longer(cols = matches("^veg|^fruit"),
names_sep = "_",
names_to = c("type", "val"),
values_drop_na = T) %>%
tidyr::pivot_wider(id_cols = id,
names_from = type,
values_from = val,
values_fn = function(x) paste0(x, collapse = ","))
Output
id veg fruit
<chr> <chr> <chr>
1 id_1 lettuce apple
2 id_2 tomato,bellpeper NA
Data
input <- structure(list(id = c("id_1", "id_2"), veg_lettuce = c("1", NA
), veg_tomato = c(NA, "1"), veg_bellpeper = c(NA, "1"), fruit_pineapple = c(NA_character_,
NA_character_), fruit_apple = c("1", NA), fruit_banana = c(NA_character_,
NA_character_)), class = "data.frame", row.names = c("id1", "id2"
))

This should do the trick!
id1 <- c("id_1", 1, NA, NA, NA, 1, NA)
id2 <- c("id_2", NA, 1, 1, 1, NA, NA)
input <- data.frame(rbind(id1, id2))
colnames(input) = c("id", "lettuce", "tomato", "bellpeper", "pineapple", "apple", "banana")
# Remove the id column, it's not necessary
input_without_id <- dplyr::select(input, -c("id"))
# For each row (margin = 1) of the input, return the names vector (names(input))
# but only in the positions the where the row (x!) is not NA
result <- apply(input_without_id, MARGIN = 1, function(x) {
return(names(input_without_id)[which(!is.na(x))])
})
# Rename the result with the corresponding ids originally found in input.
names(result) <- input$id

Here is a tidyverse solution:
library(tidyverse)
input %>%
pivot_longer(-id) %>%
group_by(id) %>%
separate(name, into = c('type', 'class'), sep = "_") %>%
na.omit() %>%
select(-value) %>%
group_by(id, type) %>%
summarise(class = toString(class)) %>%
ungroup() %>%
pivot_wider(names_from = type, values_from = class) %>%
unnest() %>%
select(id, veg, fruit)
This gives us:
# A tibble: 2 x 3
id veg fruit
<chr> <chr> <chr>
1 id_1 lettuce apple
2 id_2 tomato, bellpeper NA

Related

Pivot wider in R with multiple columns

I am having trouble converting a particular dataset from long to wide.
col1 col2
ID 55.
animal. dog
animal bear
animal rabbit
shape. circle
ID 67.
animal. cat
shape. square
As you can see, some IDs have multiple observations for "animal" and so I want to make multiple columns like this:
ID. animal. animal2 animal3 shape
55. dog bear. rabbit circle
67. cat. NA NA square
Any help is appreciated!
Try this solution.
Most of the work was creating an separate ID column and then creating the unique names for the columns.
library(tidyr)
library(dplyr)
library(vctrs)
df<- structure(list(col1 = c("ID", "animal", "animal", "animal", "shape", "ID", "animal", "shape"),
col2 = c("55.", "dog", "bear", "rabbit", "circle", "67.", "cat", "square")),
class = "data.frame", row.names = c(NA, -8L))
#create the ID column
df$ID <- NA
#find the ID rows
idrows <- which(df$col1 == "ID")
#fill column and delete rows
df$ID[idrows] <- df$col2[idrows]
df <- fill(df, ID, .direction = "down")
df <- df[-idrows, ]
#create unique names in each grouping and the pivot wider
df %>% group_by(ID) %>%
mutate(col1=vec_as_names(col1, repair = "unique")) %>%
mutate(col1=stringr::str_replace( col1, "\\.+1", "")) %>%
ungroup() %>%
pivot_wider(id_cols = "ID", names_from = "col1", values_from = "col2")
ID animal animal...2 animal...3 shape
<chr> <chr> <chr> <chr> <chr>
1 55. dog bear rabbit circle
2 67. cat NA NA square
Another alternatives based on one of your previous questions:
df %>% group_by(ID) %>%
mutate(col1 = paste0(col1, data.table::rowid(col1))) %>%
ungroup() %>%
pivot_wider(id_cols = "ID", names_from = "col1", values_from = "col2")
or
df %>%
pivot_wider(id_cols = "ID", names_from = "col1", values_from = "col2") %>%
unnest_wider( "shape", names_sep = "_") %>% unnest_wider( "animal", names_sep = "_")

R - programmatically detect NA columns and return string

I have this vector of eligible columns for my script
cols <- c("country", "phone", "car")
And this dataframe
test <-
data.frame(
id = c(1, 2, 3),
country = c("us", NA, "uk"),
phone = c(1, 1, NA),
car = c(NA, 0, 1)
)
The goal is to create a new column with the result, where the condition will be based only on columns present in cols variable. In case that all values for id are NA, then res should be string nothing, if some of them are not NA, then I need to this colnames, in case that all columns are not NA then result should be string all.
result <-
data.frame(
id = c(1, 2, 3),
country = c("us", NA, NA),
phone = c(1, 1, NA),
car = c(NA, NA, NA),
res = c("country, phone", "phone", "nothing")
)
I can do it only via case_when() function
mutate(
res = case_when(
!is.na(country) & is.na(phone) & is.na(car) ~ "country",
T ~ "?"
)
You can do this in base R (rather than dplyr) using the code:
result$res <- apply(result[,cols],1, function(x){paste(cols[!is.na(x)], collapse=", ")})
result$res[results$res==""] <- "nothing"
The data which you have shared is different (test and result). So we will start with result by removing the res column.
library(dplyr)
result$res <- NULL
result %>%
mutate_all(as.character) %>%
tidyr::pivot_longer(cols = cols) %>%
group_by(id) %>%
summarise(res = toString(name[!is.na(value)])) %>%
type.convert() %>%
left_join(res, by = 'id') %>%
mutate(res = case_when(res == '' ~ 'nothing',
stringr::str_count(result, ',') ==
(length(cols) - 1) ~ 'all',
TRUE ~ as.character(result)))
# A tibble: 3 x 5
# id res country phone car
# <dbl> <chr> <fct> <dbl> <lgl>
#1 1 country, phone us 1 NA
#2 2 phone NA 1 NA
#3 3 nothing NA NA NA
We get the data in long format, get the column names which have non-NA value for each ID. We then change the res column to "all" or "nothing" if there are all or 0 matches respectively.

tidyr::unnest() with different column types

Since the update to tidyr version 1.0.0 I have started to get an error when unnesting a list of dataframes.
The error comes because some of the data frames in the list contain a column with all NA values (logical), while other of the dataframes contain the same column but with some character values (character). The columns with all NA values are coded as logicals while the others are coded as character vectors.
The default behavior of earlier versions of tidyr handled the different column types without problems (at least I didn't get this error when running the script).
Can I solve this issue from inside tidyr::unest() ?
Reproducible example:
library(tidyr)
a <- tibble(
value = rnorm(3),
char_vec = c(NA, "A", NA))
b <- tibble(
value = rnorm(2),
char_vec = c(NA, "B"))
c <- tibble(
value = rnorm(3),
char_vec = c(NA, NA, NA))
tibble(
file = list(a, b, c)) %>%
unnest(cols = c(file))
#> No common type for `..1$file$char_vec` <character> and `..3$file$char_vec`
#> <logical>.
Created on 2019-10-11 by the reprex package (v0.3.0)
You can convert all relevant columns to character one step before unnesting.
tibble(
file = list(a, b, c)) %>%
mutate(file = map(file, ~ mutate(.x, char_vec = as.character(char_vec)))) %>%
unnest(cols = c(file))
If there are several columns that need treatment you can do:
tibble(
file = list(a, b, c)) %>%
mutate(file = map(file, ~ mutate_at(.x, vars(starts_with("char")), ~as.character(.))))
Data for the latter example:
a <- tibble(
value = rnorm(3),
char_vec = c(NA, "A", NA),
char_vec2 = c(NA, NA, NA))
b <- tibble(
value = rnorm(2),
char_vec = c(NA, "B"),
char_vec2 = c("C", "A"))
c <- tibble(
value = rnorm(3),
char_vec = c(NA, NA, NA),
char_vec2 = c("B", NA, "A"))

How can I convert data frame of survey responses to a frequency table?

I have an R dataframe of survey results. Each column is a response to a question on the survey. It can take values 1 to 10 and NA. I would like turn this into a frequency table.
This is an example of the data I have. I'm pretending the values go from 1 to 3, instead of 1 to 10.
data.frame(
"Person" = c(1,2,3),
"Question1" = c(NA, "1", "1"),
"Question2" = c("1", "2", "3")
)
What I want:
data.frame(
"Question" = c("Question1", "Question2"),
"Frequency of 1" = c(2, 1),
"Frequency of 2" = c(0 , 1),
"Frequency of 3" = c(0, 1)
)
I have tried using likert() from the likert package, but I'm getting fractional results which cannot be correct. Is there a simple solution to this problem?
Here is a solution using the dplyr and purrr packages
library(dplyr)
library(purrr)
data.frame(
"Person" = c(1,2,3),
"Question1" = c(NA, "1", "1"),
"Question2" = c("1", "2", "3")
)
df %>%
select(-Person) %>%
mutate_all(~ factor(.x, levels = as.character(1:10) ) %>% addNA() ) %>%
map(table) %>%
transpose() %>%
map(as.integer) %>%
set_names( ~ paste0("Frequency of ",ifelse(is.na(.), "NA", .))) %>%
as_tibble() %>%
mutate(Question = setdiff(names(df),"Person")) %>%
select(Question,everything(), "Frequency of NA" = `Frequency of ` )
A data.table solution:
require(data.table)
setDT(df)
# Melt data:
df <- melt(df, id.vars = "Person", value.name = "Question")
# Cast data to required structure:
df <- data.frame(dcast(df, variable ~ Question))
# Rename variables and remove NA count (as per Ops question):
names(df)[1] <- "Question"
names(df)[-1] <- gsub("X", "Frequency of ", names(df)[-1])
df$NA. <- NULL
df
# Question Frequency of 1 Frequency of 2 Frequency of 3
#1 Question1 2 0 0
#2 Question2 1 1 1
Or a one line answer:
dcast(melt(setDT(df), id.vars="Person", value.name="Question")[!Question %in% NA][, Question := paste0("Frequency of ", Question)], variable ~ Question)
A different tidyverse possibility could be:
df %>%
gather(Question, val, -Person, na.rm = TRUE) %>%
group_by(Question, val) %>%
summarise(res = length(val)) %>%
ungroup() %>%
mutate(val = paste0("Frequency.of.", val)) %>%
spread(val, res, fill = NA)
Question Frequency.of.1 Frequency.of.2 Frequency.of.3
<chr> <int> <int> <int>
1 Question1 2 NA NA
2 Question2 1 1 1
Here it, first, transforms the data from wide to long format. Second, it calculates the frequencies according the questions. Finally, it creates the "Frequency.of." variables and returns the data to its desired shape.
Or if you want to calculate also the NA values per questions:
df %>%
gather(Question, val, -Person) %>%
group_by(Question, val) %>%
summarise(res = length(val)) %>%
ungroup() %>%
mutate(val = paste0("Frequency.of.", val)) %>%
spread(val, res, fill = NA)
Question Frequency.of.1 Frequency.of.2 Frequency.of.3 Frequency.of.NA
<chr> <int> <int> <int> <int>
1 Question1 2 NA NA 1
2 Question2 1 1 1 NA
This is not the most elegant but might help: df2 is your data set.
Data:
df2<-data.frame(
"Person" = c(1,2,3),
"Question1" = c(NA, "1", "1"),
"Question2" = c("1", "2", "3"),stringsAsFactors = F
)
Target:
EDIT:: You could "automate" as follows
df2[is.na(df2)]<-0 #To allow numeric manipulation
values<-c("1","2","3")
Final_df<-sapply(values,function(val) apply(df2[,-1],2,function(x) sum(x==val)))
Final_df<-as.data.frame(Final_df)
names(Final_df)<-paste0("Frequency of_",1:ncol(Final_df))
This yields:
Frequency of_1 Frequency of_2 Frequency of_3
Question1 2 0 0
Question2 1 1 1

dplyr: Replace NAs and 0s with conditional subgroup means

I'm trying to replace all NAs and 0s in a large dataset with their respective group mean -- computed on the basis of cases that are not NA or 0.
Source: local data frame [174,019 x 3]
Groups: name
student name hours
1 s1 ABC 1.0
2 s1 DEF NA
3 s2 DEF 0.5
4 s3 NA 2.0
5 s3 ABC 2.0
6 s4 GHI 0
This solution using dplyr works as intended, but can this be done in one chain?
avg <- workshops %>%
filter(hours > 0 & !is.na(name)) %>%
group_by(name) %>%
summarize(avg.hours = mean(hours, na.rm = TRUE))
workshops <- workshops %>%
left_join(avg, by = "name") %>%
mutate(hours = if_else(hours > 0, hours, avg.hours, avg.hours)) %>%
select(-avg.hours)
Updated solution
workshop <- workshop %>%
group_by(name) %>%
mutate(hours = ifelse(!is.na(name), replace(hours, hours == 0 | is.na(hours),
mean(`is.na<-`(hours, hours == 0), na.rm = TRUE)), NA))
You can do:
workshop%>%
group_by(name)%>%
mutate(hours=replace(hours,hours==0|is.na(hours),
mean(`is.na<-`(hours,hours==0),na.rm = T)))
Here is an option with na.aggregate from zoo. After grouping by 'name', change the 0's to NA with na_if and apply na.aggregate to replace the missing values with the mean (by default, the FUN parameter is mean)
library(dplyr)
library(zoo)
workshops %>%
group_by(name) %>%
mutate(hours = na.aggregate(na_if(hours, 0)))
data
workshops <- structure(list(student = c("s1", "s1", "s2", "s3", "s3",
"s4"), name = c("ABC", "DEF", "DEF", NA, "ABC", "GHI"),
hours = c(1, NA, 0.5, 2, 2, 0)), .Names = c("student", "name", "hours"),
class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6"))

Resources