I'm trying to replace all NAs and 0s in a large dataset with their respective group mean -- computed on the basis of cases that are not NA or 0.
Source: local data frame [174,019 x 3]
Groups: name
student name hours
1 s1 ABC 1.0
2 s1 DEF NA
3 s2 DEF 0.5
4 s3 NA 2.0
5 s3 ABC 2.0
6 s4 GHI 0
This solution using dplyr works as intended, but can this be done in one chain?
avg <- workshops %>%
filter(hours > 0 & !is.na(name)) %>%
group_by(name) %>%
summarize(avg.hours = mean(hours, na.rm = TRUE))
workshops <- workshops %>%
left_join(avg, by = "name") %>%
mutate(hours = if_else(hours > 0, hours, avg.hours, avg.hours)) %>%
select(-avg.hours)
Updated solution
# Within each `name` group, replace hours that are 0 or NA with the mean of the
# group's valid hours (zeros are first turned into NA so that na.rm = TRUE
# drops them from the mean). Rows whose `name` is NA get NA hours.
# The data frame is called `workshops` everywhere else, so use that name here.
workshops <- workshops %>%
  group_by(name) %>%
  mutate(
    hours = ifelse(
      !is.na(name),
      replace(
        hours,
        hours == 0 | is.na(hours),
        mean(`is.na<-`(hours, hours == 0), na.rm = TRUE)
      ),
      NA
    )
  )
You can do:
# Per `name` group: overwrite hours that are 0 or NA with the group mean,
# where the mean is computed after turning zeros into NA (so that
# na.rm = TRUE ignores both zeros and missing values).
workshops %>%
  group_by(name) %>%
  mutate(
    hours = replace(
      hours,
      hours == 0 | is.na(hours),
      mean(`is.na<-`(hours, hours == 0), na.rm = TRUE)
    )
  )
Here is an option with na.aggregate from zoo. After grouping by 'name', change the 0's to NA with na_if and apply na.aggregate to replace the missing values with the mean (by default, the FUN parameter is mean)
library(dplyr)
library(zoo)
# Step 1: recode 0 hours as NA. Step 2: within each `name` group, fill the
# NAs with na.aggregate(), whose default FUN is mean.
workshops %>%
  group_by(name) %>%
  mutate(hours = na_if(hours, 0)) %>%
  mutate(hours = na.aggregate(hours))
data
workshops <- structure(list(student = c("s1", "s1", "s2", "s3", "s3",
"s4"), name = c("ABC", "DEF", "DEF", NA, "ABC", "GHI"),
hours = c(1, NA, 0.5, 2, 2, 0)), .Names = c("student", "name", "hours"),
class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6"))
Related
I have a table of 7 columns, the first column is id, then 3 columns of vegetable types and the last 3 columns are fruit types. The values indicate whether a person has this vegetable/ fruit. Is there a way to group the vegetables and the fruits, and output the column names if the person has that vegetable/ fruit?
Input data frame:
id1 <- c("id_1", 1, NA, NA, NA, 1, NA)
id2 <- c("id_2", NA, 1, 1, NA, NA, NA)
input <- data.frame(rbind(id1, id2))
colnames(input) = c("id", "lettuce", "tomato", "bellpeper", "pineapple", "apple", "banana")
Expected output data frame:
output_id1 <- c("id_1", "lettuce", "apple")
output_id2 <- c("id_2", "tomato, bellpeper", NA)
output <- data.frame(rbind(output_id1, output_id2))
colnames(output) <- c("id", "veg", "fruit")
Using the original input data you posted (also shown below in Data) you could do this with the tidyr package:
library(tidyr)
input %>%
tidyr::pivot_longer(cols = matches("^veg|^fruit"),
names_sep = "_",
names_to = c("type", "val"),
values_drop_na = T) %>%
tidyr::pivot_wider(id_cols = id,
names_from = type,
values_from = val,
values_fn = function(x) paste0(x, collapse = ","))
Output
id veg fruit
<chr> <chr> <chr>
1 id_1 lettuce apple
2 id_2 tomato,bellpeper NA
Data
input <- structure(list(id = c("id_1", "id_2"), veg_lettuce = c("1", NA
), veg_tomato = c(NA, "1"), veg_bellpeper = c(NA, "1"), fruit_pineapple = c(NA_character_,
NA_character_), fruit_apple = c("1", NA), fruit_banana = c(NA_character_,
NA_character_)), class = "data.frame", row.names = c("id1", "id2"
))
This should do the trick!
id1 <- c("id_1", 1, NA, NA, NA, 1, NA)
id2 <- c("id_2", NA, 1, 1, 1, NA, NA)
input <- data.frame(rbind(id1, id2))
colnames(input) = c("id", "lettuce", "tomato", "bellpeper", "pineapple", "apple", "banana")
# Remove the id column, it's not necessary
input_without_id <- dplyr::select(input, -c("id"))
# Walk over the rows (MARGIN = 1) of the id-less input and, for each row,
# keep only the column names at the positions where that row is not NA.
result <- apply(
  input_without_id,
  MARGIN = 1,
  function(row_values) names(input_without_id)[!is.na(row_values)]
)
# Label each element of the result with the matching id from the original input.
names(result) <- input$id
Here is a tidyverse solution:
library(tidyverse)
input %>%
pivot_longer(-id) %>%
group_by(id) %>%
separate(name, into = c('type', 'class'), sep = "_") %>%
na.omit() %>%
select(-value) %>%
group_by(id, type) %>%
summarise(class = toString(class)) %>%
ungroup() %>%
pivot_wider(names_from = type, values_from = class) %>%
unnest() %>%
select(id, veg, fruit)
This gives us:
# A tibble: 2 x 3
id veg fruit
<chr> <chr> <chr>
1 id_1 lettuce apple
2 id_2 tomato, bellpeper NA
I have this vector of eligible columns for my script
cols <- c("country", "phone", "car")
And this dataframe
test <-
data.frame(
id = c(1, 2, 3),
country = c("us", NA, "uk"),
phone = c(1, 1, NA),
car = c(NA, 0, 1)
)
The goal is to create a new column res, where the condition is based only on the columns listed in the cols variable. If all of those columns are NA for an id, res should be the string "nothing"; if only some of them are not NA, res should contain the names of those columns; and if none of them are NA, res should be the string "all".
result <-
data.frame(
id = c(1, 2, 3),
country = c("us", NA, NA),
phone = c(1, 1, NA),
car = c(NA, NA, NA),
res = c("country, phone", "phone", "nothing")
)
I can do it only via case_when() function
mutate(
res = case_when(
!is.na(country) & is.na(phone) & is.na(car) ~ "country",
T ~ "?"
)
You can do this in base R (rather than dplyr) using the code:
# Logical matrix: TRUE where an eligible column (one of `cols`) is non-NA.
present <- !is.na(result[, cols, drop = FALSE])
# For every row, collapse the names of the non-NA eligible columns.
result$res <- vapply(
  seq_len(nrow(present)),
  function(i) paste(cols[present[i, ]], collapse = ", "),
  character(1)
)
# No eligible column present -> "nothing"; every eligible column present -> "all".
result$res[rowSums(present) == 0] <- "nothing"
result$res[rowSums(present) == length(cols)] <- "all"
The data which you have shared is different (test and result). So we will start with result by removing the res column.
library(dplyr)
result$res <- NULL
# Build the `res` column in long format and join it back onto `result`.
result %>%
  # Make all columns character so they can share one `value` column.
  mutate_all(as.character) %>%
  tidyr::pivot_longer(cols = all_of(cols)) %>%
  group_by(id) %>%
  # Names of the eligible columns that are non-NA for this id.
  summarise(res = toString(name[!is.na(value)])) %>%
  # Convert `id` back from character so it can be joined to result$id.
  type.convert(as.is = TRUE) %>%
  left_join(result, by = 'id') %>%
  mutate(res = case_when(res == '' ~ 'nothing',
                         # n names are separated by n - 1 commas, so this
                         # means every column in `cols` is present.
                         stringr::str_count(res, ',') ==
                           (length(cols) - 1) ~ 'all',
                         TRUE ~ as.character(res)))
# A tibble: 3 x 5
# id res country phone car
# <dbl> <chr> <fct> <dbl> <lgl>
#1 1 country, phone us 1 NA
#2 2 phone NA 1 NA
#3 3 nothing NA NA NA
We get the data in long format, get the column names which have non-NA value for each ID. We then change the res column to "all" or "nothing" if there are all or 0 matches respectively.
I have a data frame where column names are duplicated once. Now I need to combine them to get a proper data set. I can use dplyr select command to extract matching columns and combine them later. However, I wish to achieve it using for loop.
#Example data frame
x <- c(1, NA, 3)
y <- c(1, NA, 4)
x.1 <- c(NA, 3, NA)
y.1 <- c(NA, 5, NA)
# The duplicated columns are named x.1 / y.1 (not x1 / y1), so pass those
# objects; data.frame() then names the columns x, y, x.1 and y.1.
data <- data.frame(x, y, x.1, y.1)
##with `dplyr` I can do like
t1 <- data%>%select(contains("x"))%>%
mutate(x = rowSums(., na.rm = TRUE))%>%
select(x)
t2 <- data%>%select(contains("y"))%>%
mutate(y = rowSums(., na.rm = TRUE))%>%
select(y)
data <- cbind(t1,t2)
This is cumbersome as I have more than 25 similar columns
How can I achieve the same result using a for loop that matches the column names and performs rowSums? A simpler approach using dplyr would also help.
We can use split.default to split based on the substring of the column names into a list and then apply the rowSums
library(dplyr)
library(stringr)
library(purrr)
data %>%
split.default(str_remove(names(.), "\\.\\d+")) %>%
map_dfr(rowSums, na.rm = TRUE)
# A tibble: 3 x 2
# x y
# <dbl> <dbl>
#1 1 1
#2 3 5
#3 3 4
If we want to use a for loop
# Base name of every column: everything before the first "." (x.1 -> x).
base_names <- sub("\\..*", "", names(data))
un1 <- unique(base_names)
# One pre-allocated, named slot per base name.
out <- setNames(vector("list", length(un1)), un1)
for (un in un1) {
  # Select columns by exact base-name equality instead of grep(), which would
  # also match any column whose name merely contains `un`.
  out[[un]] <- rowSums(data[base_names == un], na.rm = TRUE)
}
as.data.frame(out)
data
data <- structure(list(x = c(1, NA, 3), y = c(1, NA, 4), x.1 = c(NA,
3, NA), y.1 = c(NA, 5, NA)), class = "data.frame", row.names = c(NA,
-3L))
Using purrr::map_dfc and transmute instead of mutate
library(dplyr)
purrr::map_dfc(c('x','y'), ~data %>% select(contains(.x)) %>%
transmute(!!.x := rowSums(., na.rm = TRUE)))
x y
1 1 1
2 3 5
3 3 4
I have the following data frame, where each patient is a row (I am showing only a sample of it):
df = structure(list(firstY = c("N/A", "1", "3a", "3a", "3b", "1",
"2", "1", "5", "3b"), secondY = c("N/A", "1", "2", "3a", "4",
"1", "N/A", "1", "5", "3b"), ThirdY = c("N/A", "1", "N/A", "3b",
"4", "1", "N/A", "1", "N/A", "3b"), FourthY = c("N/A", "1", "N/A",
"3a", "4", "1", "N/A", "1", "N/A", "3a"), FifthY = c("N/A", "1",
"N/A", "2", "5", "1", "N/A", "N/A", "N/A", "3b")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))
I would like to plot a Sankey diagram, which shows the trajectory over time of each patient, and I know that I have to create nodes and links, but I'm having problems transforming the data to the format necessary to accomplish this. Specifically, the most problematic issue is to count how many patients belong to each trajectory, for example, how many patients went in the first year from stage 1 to 2, and all other combinations.
Any help with the data preparation would be appreciated.
The package Alluvial, although simple to understand, does not cope really well in case there is a lot of data.
It's not very clear what you'd like to achieve, because you do not mention the package you'd like to use, but looking at your data, it seems that this could help, if you could use the alluvial package:
library(alluvial) # sankey plots
library(dplyr) # data manipulation
The alluvial functions can use data in wide form like yours, but it needs a frequency column, so we can create it, then do the plot:
dats_all <- df %>% # data
group_by( firstY, secondY, ThirdY, FourthY, FifthY) %>% # group them
summarise(Freq = n()) # add frequencies
# now plot it
alluvial( dats_all[,1:5], freq=dats_all$Freq, border=NA )
On the other hand, if you'd like to use a specific package, you should specify which one.
EDIT
Using networkD3 is a bit tricky, but you can achieve some nice results with it. You need links and nodes, and they have to match each other, so first we can create the links:
# put your df in two columns, and preserve the ordering in many levels (columns) with paste0
links <- data.frame(source = c(paste0(df$firstY,'_1'),paste0(df$secondY,'_2'),paste0(df$ThirdY,'_3'),paste0(df$FourthY,'_4')),
target = c(paste0(df$secondY,'_2'),paste0(df$ThirdY,'_3'),paste0(df$FourthY,'_4'),paste0(df$FifthY,'_5')))
# now convert as character
links$source <- as.character(links$source)
links$target<- as.character(links$target)
Now the nodes are each element in the link in a unique() way:
nodes <- data.frame(name = unique(c(links$source, links$target)))
Now the links have to refer to the nodes by position, so we match each link endpoint against the node names and turn it into a number. Note the -1 at the end: networkD3 is 0-indexed, which means that the indexes start from 0.
links$source <- match(links$source, nodes$name) - 1
links$target <- match(links$target, nodes$name) - 1
links$value <- 1 # add also a value
Now you should be ready to plot your sankey:
# sankeyNetwork() comes from networkD3, which this answer never attaches, so
# call it through its namespace. Source/Target are the 0-based node indexes
# stored in `links`; NodeID is the label column of `nodes`.
networkD3::sankeyNetwork(Links = links, Nodes = nodes, Source = 'source',
                         Target = 'target', Value = 'value', NodeID = 'name')
a tidyverse way with networkd3
library(tidyr)
library(dplyr)
library(networkD3)
df <- read.table(header = TRUE, stringsAsFactors = FALSE, text = "
firstY secondY ThirdY FourthY FifthY
N/A N/A N/A N/A N/A
1 1 1 1 1
3a 2 N/A N/A N/A
3a 3a 3b 3a 2
3b 4 4 4 5
1 1 1 1 1
2 N/A N/A N/A N/A
1 1 1 1 N/A
5 5 N/A N/A N/A
3b 3b 3b 3a 3b
")
links <-
df %>%
mutate(row = row_number()) %>% # add a row id
pivot_longer(-row, names_to = "col", values_to = "source") %>% # gather all columns
mutate(col = match(col, names(df))) %>% # convert col names to col ids
mutate(source = paste0(source, '_', col)) %>% # add col id to node names
group_by(row) %>%
mutate(target = lead(source, order_by = col)) %>% # get target from following node in row
ungroup() %>%
filter(!is.na(target)) %>% # remove links from last column in original data
group_by(source, target) %>%
summarise(value = n(), .groups = "drop") # aggregate and count similar links
# create the nodes data frame from the unique node ids found in the links
# data frame (sources first, then targets)
nodes <- data.frame(id = unique(c(links$source, links$target)),
stringsAsFactors = FALSE)
# strip the trailing "_<digits>" suffix from the node id to get the display name
nodes$name <- sub('_[0-9]*$', '', nodes$id)
# add numeric node ids to the links data: the position of each endpoint in the
# nodes data frame, minus 1 so that the index is 0-based
links$source_id <- match(links$source, nodes$id) - 1
links$target_id <- match(links$target, nodes$id) - 1
# draw the diagram using the 0-based ids for the links and `name` as node label
sankeyNetwork(Links = links, Nodes = nodes, Source = 'source_id',
Target = 'target_id', Value = 'value', NodeID = 'name')
Using ggforce:
library(ggforce)
library(dplyr)
xx <- df %>%
count(firstY, secondY, ThirdY, FourthY, FifthY, name = "value") %>%
gather_set_data(1:5) %>%
mutate(x = factor(x, levels = colnames(df)))
ggplot(xx, aes(x, id = id, split = y, value = value)) +
geom_parallel_sets(alpha = 0.3, axis.width = 0.1) +
geom_parallel_sets_axes(axis.width = 0.3) +
geom_parallel_sets_labels(colour = "white")
I have an R dataframe of survey results. Each column is a response to a question on the survey. It can take values 1 to 10 and NA. I would like turn this into a frequency table.
This is an example of the data I have. I'm pretending the values go from 1 to 3, instead of 1 to 10.
data.frame(
"Person" = c(1,2,3),
"Question1" = c(NA, "1", "1"),
"Question2" = c("1", "2", "3")
)
What I want:
data.frame(
"Question" = c("Question1", "Question2"),
"Frequency of 1" = c(2, 1),
"Frequency of 2" = c(0 , 1),
"Frequency of 3" = c(0, 1)
)
I have tried using likert() from the likert package, but I'm getting fractional results which cannot be correct. Is there a simple solution to this problem?
Here is a solution using the dplyr and purrr packages
library(dplyr)
library(purrr)
# Example survey data. It must be assigned to `df`, because the pipeline
# below reads `df` (twice).
df <- data.frame(
  "Person" = c(1,2,3),
  "Question1" = c(NA, "1", "1"),
  "Question2" = c("1", "2", "3")
)
df %>%
  select(-Person) %>%
  # Turn every question into a factor with levels "1".."10" plus an NA level,
  # so that table() reports a count for every possible answer.
  mutate_all(~ factor(.x, levels = as.character(1:10) ) %>% addNA() ) %>%
  # One frequency table per question ...
  map(table) %>%
  # ... flipped into one element per answer value ...
  transpose() %>%
  # ... each holding the integer counts across questions.
  map(as.integer) %>%
  set_names( ~ paste0("Frequency of ",ifelse(is.na(.), "NA", .))) %>%
  as_tibble() %>%
  # Label the rows with the question column names.
  mutate(Question = setdiff(names(df),"Person")) %>%
  # Put Question first and rename the `Frequency of ` column (presumably the
  # NA level's count -- confirm) to "Frequency of NA".
  select(Question,everything(), "Frequency of NA" = `Frequency of ` )
A data.table solution:
# library() stops with an error if data.table is missing; require() would
# only return FALSE and let the script fail later.
library(data.table)
setDT(df)
# Melt data: one row per (Person, question) pair, answer stored in `Question`.
df <- melt(df, id.vars = "Person", value.name = "Question")
# Cast data to required structure: one row per question, one column per
# answer, counting occurrences (the aggregate is spelled out explicitly).
df <- data.frame(dcast(df, variable ~ Question, fun.aggregate = length))
# Rename variables and remove NA count (as per Ops question):
names(df)[1] <- "Question"
# data.frame() prefixed the numeric column names with "X"; replace only that
# leading prefix.
names(df)[-1] <- sub("^X", "Frequency of ", names(df)[-1])
df$NA. <- NULL
df
# Question Frequency of 1 Frequency of 2 Frequency of 3
#1 Question1 2 0 0
#2 Question2 1 1 1
Or a one line answer:
dcast(melt(setDT(df), id.vars="Person", value.name="Question")[!Question %in% NA][, Question := paste0("Frequency of ", Question)], variable ~ Question)
A different tidyverse possibility could be:
df %>%
gather(Question, val, -Person, na.rm = TRUE) %>%
group_by(Question, val) %>%
summarise(res = length(val)) %>%
ungroup() %>%
mutate(val = paste0("Frequency.of.", val)) %>%
spread(val, res, fill = NA)
Question Frequency.of.1 Frequency.of.2 Frequency.of.3
<chr> <int> <int> <int>
1 Question1 2 NA NA
2 Question2 1 1 1
Here it first transforms the data from wide to long format. Second, it calculates the frequencies for each question. Finally, it creates the "Frequency.of." variables and returns the data to its desired (wide) shape.
Or if you want to calculate also the NA values per questions:
df %>%
gather(Question, val, -Person) %>%
group_by(Question, val) %>%
summarise(res = length(val)) %>%
ungroup() %>%
mutate(val = paste0("Frequency.of.", val)) %>%
spread(val, res, fill = NA)
Question Frequency.of.1 Frequency.of.2 Frequency.of.3 Frequency.of.NA
<chr> <int> <int> <int> <int>
1 Question1 2 NA NA 1
2 Question2 1 1 1 NA
This is not the most elegant but might help: df2 is your data set.
Data:
df2<-data.frame(
"Person" = c(1,2,3),
"Question1" = c(NA, "1", "1"),
"Question2" = c("1", "2", "3"),stringsAsFactors = F
)
Target:
EDIT:: You could "automate" as follows
# Recode NA as 0 so that the comparisons below never yield NA.
df2[is.na(df2)] <- 0
# Answer values to count.
values <- c("1", "2", "3")
# For each value, count per question column (everything except Person) how
# often it occurs; the result has one row per question, one column per value.
Final_df <- sapply(values, function(val) apply(df2[, -1], 2, function(x) sum(x == val)))
Final_df <- as.data.frame(Final_df)
# Label the columns with the counted values themselves rather than with
# 1:ncol(), which is only correct when the values happen to be "1".."n".
names(Final_df) <- paste0("Frequency of_", values)
This yields:
Frequency of_1 Frequency of_2 Frequency of_3
Question1 2 0 0
Question2 1 1 1