sankey diagram in R - data preparation - r

I have the following data frame, where each patient is a row (I am showing only a sample of it):
df = structure(list(firstY = c("N/A", "1", "3a", "3a", "3b", "1",
"2", "1", "5", "3b"), secondY = c("N/A", "1", "2", "3a", "4",
"1", "N/A", "1", "5", "3b"), ThirdY = c("N/A", "1", "N/A", "3b",
"4", "1", "N/A", "1", "N/A", "3b"), FourthY = c("N/A", "1", "N/A",
"3a", "4", "1", "N/A", "1", "N/A", "3a"), FifthY = c("N/A", "1",
"N/A", "2", "5", "1", "N/A", "N/A", "N/A", "3b")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))
I would like to plot a Sankey diagram, which shows the trajectory over time of each patient, and I know that I have to create nodes and links, but I'm having problems transforming the data to the format necessary to accomplish this. Specifically, the most problematic issue is to count how many patients belong to each trajectory, for example, how many patients went in the first year from stage 1 to 2, and all other combinations.
Any help with the data preparation would be appreciated.
The package Alluvial, although simple to understand, does not cope really well in case there is a lot of data.

It's not very clear what you'd like to achieve, because you do not mention the package you'd like to use, but looking at your data, it seems that this could help, if you could use the alluvial package:
library(alluvial) # sankey plots
library(dplyr) # data manipulation
The alluvial functions can use data in wide form like yours, but it needs a frequency column, so we can create it, then do the plot:
dats_all <- df %>% # data
group_by( firstY, secondY, ThirdY, FourthY, FifthY) %>% # group them
summarise(Freq = n()) # add frequencies
# now plot it
alluvial( dats_all[,1:5], freq=dats_all$Freq, border=NA )
In the other hands, if you'd like to use a specific package, you should specify which.
EDIT
Using network3D is a bit tricky but you can maybe achieve some nice result from this. You need links and nodes, and have them matched, so first we can create the links:
# put your df in two columns, and preserve the ordering in many levels (columns) with paste0
links <- data.frame(source = c(paste0(df$firstY,'_1'),paste0(df$secondY,'_2'),paste0(df$ThirdY,'_3'),paste0(df$FourthY,'_4')),
target = c(paste0(df$secondY,'_2'),paste0(df$ThirdY,'_3'),paste0(df$FourthY,'_4'),paste0(df$FifthY,'_5')))
# now convert as character
links$source <- as.character(links$source)
links$target<- as.character(links$target)
Now the nodes are each element in the link in a unique() way:
nodes <- data.frame(name = unique(c(links$source, links$target)))
Now we need that each nodes has a link (or vice-versa), so we match them and transform in numbers. Note the -1 at the end, because networkD3 is 0 indexes, it means that the numbers (indexes) starts from 0.
links$source <- match(links$source, nodes$name) - 1
links$target <- match(links$target, nodes$name) - 1
links$value <- 1 # add also a value
Now you should be ready to plot your sankey:
sankeyNetwork(Links = links, Nodes = nodes, Source = 'source',
Target = 'target', Value = 'value', NodeID = 'name')

a tidyverse way with networkd3
library(tidyr)
library(dplyr)
library(networkD3)
df <- read.table(header = TRUE, stringsAsFactors = FALSE, text = "
firstY secondY ThirdY FourthY FifthY
N/A N/A N/A N/A N/A
1 1 1 1 1
3a 2 N/A N/A N/A
3a 3a 3b 3a 2
3b 4 4 4 5
1 1 1 1 1
2 N/A N/A N/A N/A
1 1 1 1 N/A
5 5 N/A N/A N/A
3b 3b 3b 3a 3b
")
links <-
df %>%
mutate(row = row_number()) %>% # add a row id
pivot_longer(-row, names_to = "col", values_to = "source") %>% # gather all columns
mutate(col = match(col, names(df))) %>% # convert col names to col ids
mutate(source = paste0(source, '_', col)) %>% # add col id to node names
group_by(row) %>%
mutate(target = lead(source, order_by = col)) %>% # get target from following node in row
ungroup() %>%
filter(!is.na(target)) %>% # remove links from last column in original data
group_by(source, target) %>%
summarise(value = n(), .groups = "drop") # aggregate and count similar links
# create nodes data frame from unque nodes found in links data frame
nodes <- data.frame(id = unique(c(links$source, links$target)),
stringsAsFactors = FALSE)
# remove column id from node names
nodes$name <- sub('_[0-9]*$', '', nodes$id)
# create node ids in links data to the 0-based index of the nodes in the nodes data frame
links$source_id <- match(links$source, nodes$id) - 1
links$target_id <- match(links$target, nodes$id) - 1
sankeyNetwork(Links = links, Nodes = nodes, Source = 'source_id',
Target = 'target_id', Value = 'value', NodeID = 'name')

Using ggforce:
library(ggforce)
library(dplyr)
xx <- df %>%
count(firstY, secondY, ThirdY, FourthY, FifthY, name = "value") %>%
gather_set_data(1:5) %>%
mutate(x = factor(x, levels = colnames(df)))
ggplot(xx, aes(x, id = id, split = y, value = value)) +
geom_parallel_sets(alpha = 0.3, axis.width = 0.1) +
geom_parallel_sets_axes(axis.width = 0.3) +
geom_parallel_sets_labels(colour = "white")

Related

Convert DF variable into multiple string variables not in the df (code Example included)

Having a df such as this:
df <- structure(list(V1 = c("1", "2", "3", "4", "5", "6"),
V2 = c("Cat,Dog","Fish,Bird", "Cat, Fish, Bird", "Cat", "Dog, Bird", "Owl, Dog")),
class = "data.frame",
row.names = c(NA,-6L))
I filter by an specific value, say 1:
df <- df %>% filter(V1 == 1)
How can I save the results of V2 in multiple string variables (not in the df),
say V2_1 = "Cat" and V2_1 = "Dog"
Note that when dealing with V1 == 3 now I have 3 variables V2_1,V2_2,V2_3.
Thx!
You could split the string on "," and get vector.
library(dplyr)
df <- df %>% filter(V1 == 1)
tmp <- unlist(strsplit(df$V2, ','))
If you want data as separate variables like V2_1 , V2_2 you can use list2env.
list2env(setNames(as.list(tmp), paste0('V2_', seq_along(tmp))), .GlobalEnv)

How to retrieve original dataframe after a filtered spread dplyr command

My original dataframe, df2 :
df2 <- data.frame(ID = c("1100455", "1100455", "1100455", "1100455", "1100455", "1100464", "1100464"),
CATEGORIE = c("10110", "10160", "10604", "11220", "10110", "10110","10140"),
RANK = c("1", "1", "1", "1", "0" ,"1", "1"),
MD = c("0", "0", "0", "3", "4", "0", "0" ),
PROD3 = c(2345.00,1114.58,501.40,0.00,0.00,2720.00,636.80),
VALUE3 = c(540.00,0.00,0.00,0.00,0.00,0.00,0.00),
AREA3 = c(563.76,0.00,17.35,0.00,0.00,0.00,0.00),
LONG3 = c(4100,2100,1740,265,0,3978,940)
)
I run this command to spread some news columns with value according to PROD3, and name according to MD, it works great but I lose all the others row :
df3 <- df2 %>%
filter(CATEGORIE == "10110") %>%
mutate(name = str_c("aaaa", MD, "aaa", sep = "_"),
value = PROD3) %>%
spread(name, value)
df3
How can I keep all the rows of the df2 in df3 ?
I assume you lose all the rows due to the filter which you apply to make your spread conditional.
There are two ways around this:
1) Do not filter!
Instead of using filter, try to use a more conditional approach like this:
df3 <- df2 %>%
mutate(name = case_when(
CATEGORIE == "10110" ~ str_c("aaaa", MD, "aaa", sep = "_"),
TRUE ~ NA_character_),
value = PROD3) %>%
spread(name, value)
2) Save intermediary results
You could also use your original df3 to amend df2 via a join
df3 <- df2 %>%
left_join(df3, by = "ID")

How can I convert data frame of survey responses to a frequency table?

I have an R dataframe of survey results. Each column is a response to a question on the survey. It can take values 1 to 10 and NA. I would like turn this into a frequency table.
This is an example of the data I have. I'm pretending the values go from 1 to 3, instead of 1 to 10.
data.frame(
"Person" = c(1,2,3),
"Question1" = c(NA, "1", "1"),
"Question2" = c("1", "2", "3")
)
What I want:
data.frame(
"Question" = c("Question1", "Question2"),
"Frequency of 1" = c(2, 1),
"Frequency of 2" = c(0 , 1),
"Frequency of 3" = c(0, 1)
)
I have tried using likert() from the likert package, but I'm getting fractional results which cannot be correct. Is there a simple solution to this problem?
Here is a solution using the dplyr and purrr packages
library(dplyr)
library(purrr)
data.frame(
"Person" = c(1,2,3),
"Question1" = c(NA, "1", "1"),
"Question2" = c("1", "2", "3")
)
df %>%
select(-Person) %>%
mutate_all(~ factor(.x, levels = as.character(1:10) ) %>% addNA() ) %>%
map(table) %>%
transpose() %>%
map(as.integer) %>%
set_names( ~ paste0("Frequency of ",ifelse(is.na(.), "NA", .))) %>%
as_tibble() %>%
mutate(Question = setdiff(names(df),"Person")) %>%
select(Question,everything(), "Frequency of NA" = `Frequency of ` )
A data.table solution:
require(data.table)
setDT(df)
# Melt data:
df <- melt(df, id.vars = "Person", value.name = "Question")
# Cast data to required structure:
df <- data.frame(dcast(df, variable ~ Question))
# Rename variables and remove NA count (as per Ops question):
names(df)[1] <- "Question"
names(df)[-1] <- gsub("X", "Frequency of ", names(df)[-1])
df$NA. <- NULL
df
# Question Frequency of 1 Frequency of 2 Frequency of 3
#1 Question1 2 0 0
#2 Question2 1 1 1
Or a one line answer:
dcast(melt(setDT(df), id.vars="Person", value.name="Question")[!Question %in% NA][, Question := paste0("Frequency of ", Question)], variable ~ Question)
A different tidyverse possibility could be:
df %>%
gather(Question, val, -Person, na.rm = TRUE) %>%
group_by(Question, val) %>%
summarise(res = length(val)) %>%
ungroup() %>%
mutate(val = paste0("Frequency.of.", val)) %>%
spread(val, res, fill = NA)
Question Frequency.of.1 Frequency.of.2 Frequency.of.3
<chr> <int> <int> <int>
1 Question1 2 NA NA
2 Question2 1 1 1
Here it, first, transforms the data from wide to long format. Second, it calculates the frequencies according the questions. Finally, it creates the "Frequency.of." variables and returns the data to its desired shape.
Or if you want to calculate also the NA values per questions:
df %>%
gather(Question, val, -Person) %>%
group_by(Question, val) %>%
summarise(res = length(val)) %>%
ungroup() %>%
mutate(val = paste0("Frequency.of.", val)) %>%
spread(val, res, fill = NA)
Question Frequency.of.1 Frequency.of.2 Frequency.of.3 Frequency.of.NA
<chr> <int> <int> <int> <int>
1 Question1 2 NA NA 1
2 Question2 1 1 1 NA
This is not the most elegant but might help: df2 is your data set.
Data:
df2<-data.frame(
"Person" = c(1,2,3),
"Question1" = c(NA, "1", "1"),
"Question2" = c("1", "2", "3"),stringsAsFactors = F
)
Target:
EDIT:: You could "automate" as follows
df2[is.na(df2)]<-0 #To allow numeric manipulation
values<-c("1","2","3")
Final_df<-sapply(values,function(val) apply(df2[,-1],2,function(x) sum(x==val)))
Final_df<-as.data.frame(Final_df)
names(Final_df)<-paste0("Frequency of_",1:ncol(Final_df))
This yields:
Frequency of_1 Frequency of_2 Frequency of_3
Question1 2 0 0
Question2 1 1 1

dplyr: Replace NAs and 0s with conditional subgroup means

I'm trying to replace all NAs and 0s in a large dataset with their respective group mean -- computed on the basis of cases that are not NA or 0.
Source: local data frame [174,019 x 3]
Groups: name
student name hours
1 s1 ABC 1.0
2 s1 DEF NA
3 s2 DEF 0.5
4 s3 NA 2.0
5 s3 ABC 2.0
6 s4 GHI 0
This solution using dplyr works as intended, but can this be done in one chain?
avg <- workshops %>%
filter(hours > 0 & !is.na(name)) %>%
group_by(name) %>%
summarize(avg.hours = mean(hours, na.rm = TRUE))
workshops <- workshops %>%
left_join(avg, by = "name") %>%
mutate(hours = if_else(hours > 0, hours, avg.hours, avg.hours)) %>%
select(-avg.hours)
Updated solution
workshop <- workshop %>%
group_by(name) %>%
mutate(hours = ifelse(!is.na(name), replace(hours, hours == 0 | is.na(hours),
mean(`is.na<-`(hours, hours == 0), na.rm = TRUE)), NA))
You can do:
workshop%>%
group_by(name)%>%
mutate(hours=replace(hours,hours==0|is.na(hours),
mean(`is.na<-`(hours,hours==0),na.rm = T)))
Here is an option with na.aggregate from zoo. After grouping by 'name', change the 0's to NA with na_if and apply na.aggregate to replace the missing values with the mean (by default, the FUN parameter is mean)
library(dplyr)
library(zoo)
workshops %>%
group_by(name) %>%
mutate(hours = na.aggregate(na_if(hours, 0)))
data
workshops <- structure(list(student = c("s1", "s1", "s2", "s3", "s3",
"s4"), name = c("ABC", "DEF", "DEF", NA, "ABC", "GHI"),
hours = c(1, NA, 0.5, 2, 2, 0)), .Names = c("student", "name", "hours"),
class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6"))

dplyr: Build set of items in list column

I want a column that tracks which items are included in a set based on a predicate. It seems like I should be able to do this with some combination of the purrr accumulate function and the dplyr lead/lag and union/setdiff functions.
This is probably best expressed as a reprex:
input_df <- dplyr::data_frame(user = c("1", "1", "1", "1"),
item = c("a", "b", "a", "a"),
include = c(TRUE, TRUE, FALSE, TRUE))
output_df <- dplyr::data_frame(user = c("1", "1", "1", "1"),
set = list(
c("a"),
c("a", "b"),
c("b"),
c("a", "b")))
Edit: I'm very close. I need to find a way of finding the "bag difference" (instead of the set difference) between vectors in case a user includes, excludes and then re-includes an item.
numbered_input_df <- input_df %>%
mutate(id = row_number())
include_df <- numbered_input_df %>%
filter(include == TRUE) %>%
mutate(include_set = purrr::accumulate(item, c)) %>%
select(user, id, include_set)
exclude_df <- numbered_input_df %>%
filter(include == FALSE) %>%
mutate(exclude_set = purrr::accumulate(item, c)) %>%
select(user, id, exclude_set)
numbered_input_df %>%
left_join(include_df) %>%
left_join(exclude_df) %>%
fill(include_set, exclude_set) %>%
mutate(set = map2(include_set, exclude_set, ~.x[! .x %in% .y]))
Define Update which takes the union or setdiff of the basket with the ith item and use Reduce to apply it to each i. Use ave to do all that by user. No packages are used.
Update <- function(basket, i) with(input_df[i, ],
(if (include) union else setdiff)(basket, item)
)
n <- nrow(input_df)
reduce_user <- function(ix) Reduce(Update, init = NULL, ix, accumulate = TRUE)[-1]
transform(input_df["user"], set = I(ave(as.list(1:n), user, FUN = reduce_user)))
giving:
user set
1 1 a
2 1 a, b
3 1 b
4 1 b, a
Alternately, translating the above to dplyr and purrr and making use of Update from above we get the code below.
library(dplyr)
library(purrr)
input_df %>%
mutate(ix = 1:n()) %>%
group_by(user) %>%
mutate(set = accumulate(ix, Update, .init = NULL)[-1]) %>%
ungroup %>%
select(user, set)
(Note that the only use of purrr is accumulate and that could easily be replaced with Reduce if you want to reduce dependencies.)

Resources