Convert multiple columns that are onehot encoded into a factor column - r

Some columns in the dataset are one hot encoded. I wish to convert them into one factor column.
I wish to write a code where I specify which columns to be combined and converted to factor column.
Below is an example with desired result.
library(tidyverse)
tbl <- tibble(
# one hot encoded
a1_blue = c(1, 0, 0),
a1_red = c(0, 1, 0),
a1_green = c(0, 0, 1),
# one hot encoded
a2_square = c(1, 0, 0),
a2_circle = c(0, 1, 0),
a2_dot = c(0, 0, 1),
a3_letters = factor(c("A", "B", "C"))
)
tbl_desired <- tibble(
a1_colors = factor(c("blue", "red", "green"),
levels = c("blue", "red", "green")),
a2_shapes = factor(c("square", "circle", "dot"),
levels = c("square", "circle", "dot")),
a3_letters = factor(c("A", "B", "C"))
)

This will give you the structure that you need. You can convert the columns to factors using mutate(across(everything(), as.factor)).
# Reshape to long form, keep only the active indicator of each one-hot
# group, split names such as "a1_blue" into group ("a1") and level ("blue"),
# then widen again so that every group becomes one column of level labels.
tbl %>%
  pivot_longer(-a3_letters) %>%
  filter(value != 0) %>%
  separate(name, into = c("var", "val")) %>%
  # `id_cols` is given by name: recent tidyr releases put `...` directly
  # after `data` and deprecate supplying `id_cols` by position. Naming the
  # single identifying column also drops the now redundant `value` column.
  pivot_wider(id_cols = a3_letters, names_from = var, values_from = val)
#> # A tibble: 3 x 3
#> a3_letters a1 a2
#> <fct> <chr> <chr>
#> 1 A blue square
#> 2 B red circle
#> 3 C green dot

Related

highlight cell if previous column meets specific condition R

I have a dataframe
library(flextable)
df = structure(list(col1 = c(1, NA, 1, 1, 1), col2 = c(NA, 1, NA,
1, 1), col3 = c(1, 1, NA, 1, NA), col4 = c(1, 1, 1, 1, NA)), class = "data.frame", row.names = c(NA,
-5L))
df %>% flextable()
I want
to return the last 3 columns highlighted based on the following logic:
red if it is blank
green if and only if the preceding column was blank.
Based on this, I am trying to create a color matrix to identify the green highlights, but have hit a brick wall.
To identify the red matrix, I used the following code ifelse(is.na(df),"red","").
what would be the best method to identify the green labels
Not the prettiest, but works
# Example data: NA marks a blank cell.
df <- data.frame(
  col1 = c(1, NA, 1, 1, 1, 1),
  col2 = c(NA, 1, NA, 1, 1, 1),
  col3 = c(1, 1, NA, 1, NA, 1),
  col4 = c(1, 1, 1, 1, NA, 1)
)
flextable(df)

# red[i, j] is 1 where df[i, j] is blank (NA) and 0 otherwise.
red <- is.na(df) * 1

# A cell is green when the column to its left is blank and the cell itself
# is filled. Comparing the matrix without its last column against the matrix
# without its first column evaluates every neighbouring column pair at once,
# so no loop (and no object grown inside a loop) is needed.
left_blank <- red[, -ncol(red), drop = FALSE] == 1
cell_filled <- red[, -1, drop = FALSE] == 0
green <- (left_blank & cell_filled) * 1
colnames(green) <- colnames(df)[-1]
green
red[, -1]

# Colour every displayed column (all but the first). Row selectors are passed
# as logical vectors with one entry per row; a cell can never be both red and
# green, so the order of the two calls does not matter.
ft <- flextable(df[, -1])
for (j in seq_len(ncol(green))) {
  ft <- bg(ft, i = unname(red[, j + 1] == 1), j = j, bg = "red")
  ft <- bg(ft, i = unname(green[, j] == 1), j = j, bg = "green")
}
ft

How can I pull frequency data from multiple columns to make a bar chart?

Essentially, I am trying to make a bar chart using dplyr where there are several columns, say A, B, and C.
Each column has a value classifying it, 0 or 1, if the row corresponds to that type of value
I am trying to make a bar chart using ggplot that shows the number of rows that contain a true value in each column. Any advice, at least on the syntax I'd follow?
Example:
A 1 1 1 0 0 0
B 0 0 0 1 0 0
C 0 0 0 0 1 1
I want to show the frequency of each, but as if those three were columns
Edit: I should note that I am trying to pull these from a larger data set, e.x. A, B, C, D, E, F, G, H.... but I only want A, B, and C
Try this
library(dplyr)
library(ggplot2)
library(tibble)
df <- as.data.frame(
rbind(
A = c(1, 1, 1, 0, 0, 0),
B = c(0, 0, 0, 1, 0, 0),
C = c(0, 0, 0, 0, 1, 1),
D = c(0, 0, 0, 0, 0, 0),
E = c(0, 0, 0, 0, 0, 0)
))
# Count the 1s in every row, keep rows A, B and C, and draw one bar per row.
df %>%
  # NOTE: name of id variable should not start with "v" or "V"
  # Otherwise the select will not work.
  rownames_to_column(var = "type") %>%
  mutate(count = rowSums(select(., starts_with("V")), na.rm = TRUE)) %>%
  select(type, count) %>%
  filter(type %in% c("A", "B", "C")) %>%
  ggplot(aes(type, count, fill = type)) +
  geom_col() +
  # "none" hides the fill legend; `guides(fill = FALSE)` is deprecated in
  # current ggplot2 releases.
  guides(fill = "none")
Created on 2020-03-15 by the reprex package (v0.3.0)
Update
First of all, both the solution by #Chris and by #Jonathan are much cleaner and clearer than my approach and both are more efficient. In terms of efficiency the base R solution by #Chris is however by far the most efficient (not only in terms of programmer efficiency (;). Results show that the base R solution gives a speedup compared to the tidyverse solutions by a factor of ~10. Whether this is crucial depends on the size of the dataset or ...
Here are the results:
I simply put the different solutions in functions (I only did some renaming) and ran a microbenchmark. I also added a fourth function which adapts the code by #Chris to allow for flexible names.
library(dplyr)
library(tidyr)
library(ggplot2)
library(tibble)
# example data
df <- as.data.frame(
rbind(
A = c(1, 1, 1, 0, 0, 0),
B = c(0, 0, 0, 1, 0, 0),
C = c(0, 0, 0, 0, 1, 1),
D = c(0, 0, 0, 0, 0, 0),
E = c(0, 0, 0, 0, 0, 0)
))
# Tidyverse 1 using select & rowSums
sum_rows1 <- function(df) {
  # Move the row names into a regular `type` column and keep rows A, B, C.
  typed <- rownames_to_column(df, var = "type")
  wanted <- filter(typed, type %in% c("A", "B", "C"))
  # NOTE: name of id variable should not start with "v" or "V"
  # Otherwise the select will not work.
  value_cols <- select(wanted, starts_with("V"))
  totals <- mutate(wanted, count = rowSums(value_cols, na.rm = TRUE))
  # Return only the label and its row total.
  select(totals, type, count)
}
# Tidyverse 2 using pivot_longer
sum_rows2 <- function(df) {
  # Transpose so that every original row (A, B, ...) becomes a column.
  flipped <- as.data.frame(t(df))
  # One row per (type, value) pair.
  long <- pivot_longer(
    flipped,
    cols = everything(),
    names_to = "type",
    values_to = "value"
  )
  # Keep only the letters A, B, C.
  kept <- filter(long, type %in% c("A", "B", "C"))
  # Total of the values per letter.
  summarize(group_by(kept, type), count = sum(value))
}
# base R 1 with fixed names
# Sum the first three rows of `df` and label them "A", "B", "C".
#
# df: a data frame (or matrix) of numeric values with at least three rows.
# Returns a data frame with columns `type` and `count`.
sum_rows3 <- function(df) {
  # `drop = FALSE` keeps a matrix even when `df` has a single column;
  # rowSums() replaces the transpose + apply(..., 2, sum) round trip.
  # Note that rowSums() always returns doubles.
  first_three <- as.matrix(df)[1:3, , drop = FALSE]
  sum1 <- rowSums(first_three)
  data.frame(type = LETTERS[1:3], count = sum1)
}
# base R 2 with flexible names
# Sum the rows of `df` selected by `cols` and label each sum with its row name.
#
# df:   a data frame (or matrix) of numeric values.
# cols: row names (or row indices) of the rows to sum; one or more.
# Returns a data frame with columns `type` and `count`.
sum_rows4 <- function(df, cols) {
  # Indexing the matrix (not the data frame) matches row names exactly, and
  # `drop = FALSE` keeps a matrix when only one row is requested, so
  # sum_rows4(df, "A") works as well as sum_rows4(df, c("A", "B")).
  selected <- as.matrix(df)[cols, , drop = FALSE]
  sum1 <- rowSums(selected)
  data.frame(type = names(sum1), count = sum1)
}
(df1 <- sum_rows1(df))
#> type count
#> 1 A 3
#> 2 B 1
#> 3 C 2
(df2 <- sum_rows2(df))
#> # A tibble: 3 x 2
#> type count
#> <chr> <dbl>
#> 1 A 3
#> 2 B 1
#> 3 C 2
(df3 <- sum_rows3(df))
#> type count
#> A A 3
#> B B 1
#> C C 2
(df4 <- sum_rows4(df, c("A","B","C")))
#> type count
#> A A 3
#> B B 1
#> C C 2
# Benchmark the solutions
microbenchmark::microbenchmark(sum_rows1(df), sum_rows2(df), sum_rows3(df), sum_rows4(df, c("A","B","C")))
#> Unit: microseconds
#> expr min lq mean median uq
#> sum_rows1(df) 4239.5 4619.60 6079.313 6072.20 6771.15
#> sum_rows2(df) 3658.1 4085.55 5309.038 5225.95 5939.90
#> sum_rows3(df) 301.6 383.15 540.001 437.55 539.10
#> sum_rows4(df, c("A", "B", "C")) 302.6 387.05 533.977 469.05 546.40
#> max neval
#> 11238.7 100
#> 13808.2 100
#> 5018.6 100
#> 4106.9 100
Created on 2020-03-16 by the reprex package (v0.3.0)
Here is another solution using tidyverse that uses two great functions (pivot_longer and summarize) to organize the data and build the desired plot.
library(tidyverse)
df %>%
#Transpose the data
t() %>%
#Convert it as data.frame
as.data.frame() %>%
#Get data from wide to long format
pivot_longer(cols = everything(),
names_to = "var",
values_to = "value") %>%
#Filter to stay only with letters A, B, C
filter(var %in% c("A","B","C")) %>%
#group by var (i.e., letters)
group_by(var) %>%
#Get the sum of values per letter
summarize(sum = sum(value)) %>%
#ggplot with geom_col (i.e., columns plot)
ggplot(aes(x = var,
y = sum,
fill = var)) +
geom_col()
A simple base R solution is this, using #stefan's data:
First, calculate the sums for each row in df by transposing it (flipping rows into columns and vice versa) using t() as well as apply(), with 2 for the rows in df that have become columns in t(df), and sum for the sums:
sum1 <- apply(t(df)[,1:3], 2, sum)
Then create a dataframe with the relevant sequence of upper-case letters as the first variable and sum1 as the second variable:
sum2 <- data.frame(types = LETTERS[1:3], sum1)
And finally plot your barplot using sum2 as input data:
ggplot(sum2, aes(types, sum1, fill = types)) +
geom_col(fill = c("#009E00", "#F0E300", "#0066B2"))

gather 3 different detections of three different variables

I have a dataframe of 96074 obs. of 31 variables.
the first two variables are id and the date, then I have 9 columns with measurement (three different KPIs with three different time properties), then various technical and geographical variables.
df <- data.frame(
id = rep(1:3, 3),
time = rep(as.Date('2009-01-01') + 0:2, each = 3),
sum_d_1day_old = rnorm(9, 2, 1),
sum_i_1day_old = rnorm(9, 2, 1),
per_i_d_1day_old = rnorm(9, 0, 1),
sum_d_5days_old = rnorm(9, 0, 1),
sum_i_5days_old = rnorm(9, 0, 1),
per_i_d_5days_old = rnorm(9, 0, 1),
sum_d_15days_old = rnorm(9, 0, 1),
sum_i_15days_old = rnorm(9, 0, 1),
per_i_d_15days_old = rnorm(9, 0, 1)
)
I want to transform from wide to long, in order to do graphs with ggplot using facets for example.
If I had a df with just one variable with its three-time scans I would have no problem in using gather:
plotdf <- df %>%
gather(sum_d, value,
c(sum_d_1day_old, sum_d_5days_old, sum_d_15days_old),
factor_key = TRUE)
But having three different variables trips me up.
I would like to have this output:
plotdf <- data.frame(
id = rep(1:3, 3),
time = rep(as.Date('2009-01-01') + 0:2, each = 3),
sum_d = rep(c("sum_d_1day_old", "sum_d_5days_old", "sum_d_15days_old"), 3),
values_sum_d = rnorm(9, 2, 1),
sum_i = rep(c("sum_i_1day_old", "sum_i_5days_old", "sum_i_15days_old"), 3),
values_sum_i = rnorm(9, 2, 1),
per_i_d = rep(c("per_i_d_1day_old", "per_i_d_5days_old", "per_i_d_15days_old"), 3),
values_per_i_d = rnorm(9, 2, 1)
)
with id, sum_d, sum_i and per_i_d of class factor time of class Date and the values of class numeric (I have to add that I don't have negative measures in these variables).
what I've tried to do:
plotdf <- gather(df, key, value, sum_d_1day_old:per_i_d_15days_old, factor_key = TRUE)
gathering all of the variables in a single column
plotdf$KPI <- paste(sapply(strsplit(as.character(plotdf$key), "_"), "[[", 1),
sapply(strsplit(as.character(plotdf$key), "_"), "[[", 2), sep = "_")
creating a new column with the name of the KPI, without the time specification
plotdf %>% unite(value2, key, value) %>%
#creating a new variable with the full name of the KPI attaching the value at the end
mutate(i = row_number()) %>% spread(KPI, value2) %>% select(-i)
#spreading
But spread creates rows with NAs.
To replace then at first I used
group_by(id, date) %>%
fill(c(sum_d, sum_i, per_i_d), .direction = "down") %>%
fill(c(sum_d, sum_i, per_i_d), .direction = "up") %>%
But the problem is that there are already some measurements with NAs in the original df in the variable per_i_d (44 in total), so I lose that information.
I thought that I could replace the NAs in the original df with a dummy value and then replace the NAs back, but then I thought that there could be a more efficient solution for all of my problem.
After I replaced the NAs, my idea was to use slice(1) to select only the first row of each couple id/date, then do some manipulation with separate/unite to have the output I desired.
I actually did that, but then I remembered I had those aforementioned NAs in the original df.
# Stack all measurement columns, split each column name into the KPI
# (`type`) and the age suffix (`age`), then give every KPI its own column.
df %>%
  gather(key, value, -id, -time) %>%
  mutate(
    # Everything before the "_<digits>" age suffix is the KPI name, so
    # "per_i_d_1day_old" yields "per_i_d" (a pattern limited to two name
    # segments would truncate it to "per_i").
    type = str_extract(key, "^[a-z_]+(?=_[0-9])"),
    # Age suffix such as "1day_old" or "15days_old".
    age = str_extract(key, "[0-9]+[a-z]+_[a-z]+")
  ) %>%
  select(-key) %>%
  spread(type, value)
gives
id time age per_i sum_d sum_i
1 1 2009-01-01 15days_old 0.8132301 0.8888928 0.077532040
2 1 2009-01-01 1day_old -2.0993199 2.8817133 3.047894196
3 1 2009-01-01 5days_old -0.4626151 -1.0002926 0.327102000
4 1 2009-01-02 15days_old 0.4089618 -1.6868523 0.866412133
5 1 2009-01-02 1day_old 0.8181313 3.7118065 3.701018419
...
EDIT:
adding non-value columns to the dataframe:
# Same reshaping as before, but also carry a text column (`info`) through
# the spread by gathering `value` and `info` together first.
df %>%
  gather(key, value, -id, -time) %>%
  mutate(
    # Full KPI name in front of the "_<digits>" age suffix, e.g. "per_i_d"
    # for "per_i_d_1day_old" rather than a truncated "per_i".
    type = str_extract(key, "^[a-z_]+(?=_[0-9])"),
    age = str_extract(key, "[0-9]+[a-z]+_[a-z]+"),
    info = paste(age, type, sep = "_")
  ) %>%
  select(-key) %>%
  # `value` and `info` end up in one column, so numbers become text here.
  gather(key, value, -id, -time, -age, -type) %>%
  unite(dummy, type, key) %>%
  spread(dummy, value)

R group by substring

Sample data
data = data.frame(id = c(1, 2, 3, 4, 5),
name = c("blue", "green", "red", "read", "HUE"),
WANT = c("ue", "re", "re", "re", "ue"))
To explain. If 'name' contains "ue", then WANT = "ue" and if 'name' contains 're' then WANT = "re". Capitalization does not matter.
This is my attempt:
df$attempt <- NA
df$attempt[substr(df$name) == "ue"] <- "ue"
df$attempt[substr(df$name) == "re"] <- "re"
A solution using stringr (part of the tidyverse).
library(tidyverse)
data2 <- data %>%
mutate(attempt = str_extract(name, pattern = regex("ue|re", ignore_case = TRUE)),
attempt = str_to_lower(attempt))
data2
# id name WANT attempt
# 1 1 blue ue ue
# 2 2 green re re
# 3 3 red re re
# 4 4 read re re
# 5 5 HUE ue ue
DATA
data = data.frame(id = c(1, 2, 3, 4, 5),
name = c("blue", "green", "red", "read", "HUE"),
WANT = c("ue", "re", "re", "re", "ue"))
Here is a couple of versions
data <- data.frame(
  id = c(1, 2, 3, 4, 5),
  name = c("blue", "green", "red", "read", "HUE")
)
# base R version
# "ue" is tested first, then "re"; names containing neither become NA.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
data$want <- ifelse(
  grepl("ue", data$name, ignore.case = TRUE),
  "ue",
  ifelse(grepl("re", data$name, ignore.case = TRUE), "re", NA)
)
#tidyverse version
library(dplyr)
# "ue" is tested first, then "re"; names containing neither become NA.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
data <- data %>%
  mutate(
    want = ifelse(
      grepl("ue", name, ignore.case = TRUE),
      "ue",
      ifelse(grepl("re", name, ignore.case = TRUE), "re", NA)
    )
  )
Try using ifelse and mutate. grepl("ue",name,ignore.case = T) checks if ue or UE exists. Same logic applies to [re]
library(dplyr)
# Case-insensitive match: "ue" is tested first, then "re", otherwise NA.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
data <- data %>%
  mutate(
    Attempt = ifelse(
      grepl("ue", name, ignore.case = TRUE),
      "ue",
      ifelse(grepl("re", name, ignore.case = TRUE), "re", NA)
    )
  )
With purrr and dplyr:
library(dplyr)
library(purrr)
data %>%
mutate(group = map2_chr(WANT, name, ~ .x[grepl(.x, .y, ignore.case = TRUE)]))
Output:
id name WANT group
1 1 blue ue ue
2 2 green re re
3 3 red re re
4 4 read re re
5 5 HUE hu hu
Data:
data = data.frame(id = c(1, 2, 3, 4, 5),
name = c("blue", "green", "red", "read", "HUE"),
WANT = c("ue", "re", "re", "re", "hu"),
stringsAsFactors = FALSE)

R: Tidy Aggregation of Sequence Data and Visualization of Stepfunctions

I have some patient data, where the individual patients change treatment groups over time. My goal is to visualize the sequence of group changes and aggregate this data into a "sequence profile" for each treatment group.
For each treatment group I would like to show, when it generally occurs
in the treatment cycle (say rather in the beginning or in the end). To account for the differing sequence length, I would like to standardize these profiles between 0 (very beginning) and 1 (end).
I would like to find an efficient data preparation and visualization.
Minimal Example
Structure of Data
library(dplyr)
library(purrr)
library(ggplot2)
# minimal data
cj_df_raw <- tibble::tribble(
~id, ~group,
1, "A",
1, "B",
2, "A",
2, "B",
2, "A"
)
# compute "intervals" for each person [start, end]
cj_df_raw %>%
group_by(id) %>%
mutate(pos = row_number(),
len = length(id),
start = (pos - 1) / len,
end = pos / len) %>%
filter(group == "A")
#> # A tibble: 3 x 6
#> # Groups: id [2]
#> id group pos len start end
#> <dbl> <chr> <int> <int> <dbl> <dbl>
#> 1 1 A 1 2 0 0.5
#> 2 2 A 1 3 0 0.333
#> 3 2 A 3 3 0.667 1
(So Id 1 was in group A in the first 50% of their sequence, and Id 2 was in Group A in the first 33% and the last 33% of their sequence. This means that 2 Ids were between 0-33% of the sequence, 1 between 33-50%, 0 between 50-66% and 1 above 66%.)
This is the result I would like to achieve and I miss a chance to transform my data effectively.
Desired outcome
profile_treatmen_a <- tibble::tribble(
~x, ~y,
0, 0L,
0.33, 2L,
0.5, 1L,
0.66, 0L,
1, 1L,
1, 0L
)
profile_treatmen_a %>%
ggplot(aes(x, y)) +
geom_step(direction = "vh") +
expand_limits(x = c(0, 1), y = 0)
(Ideally the area under the curve would be shaded)
Ideal solution: via ggridges
The goal of the visualization would be to compare the "sequence-profile" of many treatment-groups at the same time. If I could prepare the data accordingly, I would like to use the ggridges-package for a striking visual comparison the treatment groups.
library(ggridges)
data.frame(group = rep(letters[1:2], each=20),
mean = rep(2, each=20)) %>%
mutate(count = runif(nrow(.))) %>%
ggplot(aes(x=count, y=group, fill=group)) +
geom_ridgeline(stat="binline", binwidth=0.5, scale=0.9)
You could build helper intervals and then just plot a histogram. Since each patient is either in Group A or B both groups sum up to 100%. With these helper intervals you could also easily switch to other geoms.
library(tidyverse, warn.conflicts = FALSE)
library(ggplot2)
# create sample data
set.seed(42)
id <- 1:10 %>% map(~ rep(x = .x, times = runif(n = 1, min = 1, max = 6))) %>%
unlist()
group <- sample(x = c("A", "B"), size = length(id), replace = TRUE) %>%
as_factor()
df <- tibble(id, group)
glimpse(df)
#> Observations: 37
#> Variables: 2
#> $ id <int> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 5, 5,...
#> $ group <fct> A, B, B, A, A, B, B, A, A, B, B, A, B, B, A, B, A, B, A,...
# tidy data
# Give every observation its share [from, to] of the patient's sequence,
# then expand that share into helper points 1/60 apart so that a histogram
# with binwidth 1/60 shows how many patients are in each group at each
# relative position.
df <- df %>%
  group_by(id) %>%
  mutate(
    from = (row_number() - 1) / n(),
    to = row_number() / n()
  ) %>%
  ungroup() %>%
  # map2() builds one sequence per row without switching the tibble to
  # rowwise mode, so the result stays an ordinary ungrouped tibble.
  mutate(list = map2(from, to, ~ seq(.x + 1/60, .y, 1/60))) %>%
  # Name the list-column explicitly; calling unnest() without a column is
  # deprecated in current tidyr.
  unnest(list)
# plot
df %>%
ggplot(aes(x = list, fill = group)) +
geom_histogram(binwidth = 1/60) +
ggthemes::theme_hc()
Created on 2018-09-16 by the [reprex package](http://reprex.tidyverse.org) (v0.2.0).
My attempt at an answer.. although it is probably not the nicest/fastest/most efficient way, I think it might help you in your efforts.
library(data.table)
# compute "intervals" for each person [start, end]
# For every observation compute its position within the patient's sequence
# and the relative interval [from, to] it occupies; `value = 1` is the weight
# that is summed after the join below.
df <- cj_df_raw %>%
group_by(id) %>%
mutate(pos = row_number(),
len = length(id),
from = (pos - 1) / len,
to = pos / len,
value = 1)
# foverlaps() needs the second table keyed on its interval columns.
dt <- as.data.table(df)
setkey(dt, from, to)
# create intervals: a grid of 100 cells of width 0.01 covering [0, 1]
dt.interval <- data.table(from = seq( from = 0, by = 0.01, length.out = 100),
to = seq( from = 0.01, by = 0.01, length.out = 100))
# perform overlap join on intervals: type = "within" matches grid cells that
# lie inside an observation interval; the weights are then summed per grid
# cell start (i.from) and group (the unnamed sum becomes column V1)
dt2 <- foverlaps( dt.interval, dt, type = "within", nomatch = NA)[, sum(value), by = c("i.from", "group")]
# some melting and casting to fill in '0' on empty intervals
dt3 <- melt( dcast(dt2, ... ~ group, fill = 0), id.vars = 1 )
# plot: one step curve per group, one facet per group
ggplot( dt3 ) +
geom_step( aes( x = i.from, y = value, color = variable ) ) +
facet_grid( .~variable )

Resources