geom_bar two datasets together in R

geom_bar two datasets together in R - r

I have two dataframes with two columns each, that I would like to plot together as a barplot using ggplot in R as shown below:
How can I do this using dplyr in R?
Sample Data:
DF1
Code Count_2020
A 1
B 2
C 3
D 4
E 5
F 6
DF2
Code Count_2021
A 4
B 8
C 6
D 8
E 10
F 12
So, I first thought of merging the two dataframes into one using dplyr::inner_join, and I got a new dataframe as shown below:
Code Count_2021 Count_2020
A 4 1
B 8 2
C 6 3
D 8 4
E 10 5
F 12 6
Next I thought of using tidyr::gather to plot the count data from both years together as Type and Value, but this messed up the gathered dataframe, as the output changed to:
Type Value
Code A
Code B
Code C
Code D
Code E
Code F
Code I tried
library(tidyverse)
# Merge DF1 and DF2
DF = inner_join(DF1, DF2)
# Gather data for plotting
# NOTE: gather() is given no column selection here, so every column kept by
# select() -- including Code -- is stacked into the Type/Value pairs.
Gathered_DF= DF%>% dplyr::select(Code, Count_2020, Count_2021) %>%
gather(key = Type, value = Value) # Output not as expected, stuck!!

We can reshape to 'long' format with pivot_longer after the join and then use geom_col in ggplot2 with position specified as 'dodge' and fill as 'Year'.
library(dplyr)
library(tidyr)
library(ggplot2)
# Join the two yearly tables on their shared key, reshape to long format
# (one row per Code/Year pair), and draw the years side by side.
# `by = "Code"` makes the join key explicit instead of relying on dplyr
# guessing it from the common column names.
inner_join(DF1, DF2, by = "Code") %>%
  pivot_longer(cols = -Code, names_to = "Year", names_prefix = "Count_") %>%
  ggplot(aes(x = Code, y = value, fill = Year)) +
  geom_col(position = "dodge") +
  theme_bw()
-output
data
# Sample data: one count column per year, keyed by Code.
# Counts are kept as integers; Code is kept as character.
codes <- c("A", "B", "C", "D", "E", "F")
DF1 <- data.frame(
  Code = codes,
  Count_2020 = 1:6,
  stringsAsFactors = FALSE
)
DF2 <- data.frame(
  Code = codes,
  Count_2021 = c(4L, 8L, 6L, 8L, 10L, 12L),
  stringsAsFactors = FALSE
)

You can use pivot_longer instead of gather, since gather is superseded (as of tidyr 1.1.3).
library(tidyverse)
df1 <- data.frame(Code = c("A", "B", "C", "D", "E", "F"),
Count_2020 = c(1,2,3,4,5,6))
df2 <- data.frame(Code = c("A", "B", "C", "D", "E", "F"),
Count_2021 = c(4, 8, 6, 8, 10, 12))
df_joined <- df1 %>%
inner_join(df2, by = "Code") %>%
pivot_longer(cols = !Code, names_to = "Year", names_prefix = "Count_", values_to = "Count")
df_joined
#> # A tibble: 12 x 3
#> Code Year Count
#> <fct> <chr> <dbl>
#> 1 A 2020 1
#> 2 A 2021 4
#> 3 B 2020 2
#> 4 B 2021 8
#> 5 C 2020 3
#> 6 C 2021 6
#> 7 D 2020 4
#> 8 D 2021 8
#> 9 E 2020 5
#> 10 E 2021 10
#> 11 F 2020 6
#> 12 F 2021 12
# Dodged bars: one bar per Code/Year pair, bar height taken from Count.
# geom_col() is the direct equivalent of geom_bar(stat = "identity").
ggplot(df_joined, aes(x = Code, y = Count, fill = Year)) +
  geom_col(position = "dodge")
In the code above, the arguments passed to pivot_longer are:
cols = !Code: the columns to pivot, i.e. every column except Code
names_to = "Year": the name of the new column that holds the former column names (used for grouping)
names_prefix = "Count_": removes the string "Count_" from the values stored in the new column "Year"
values_to = "Count": the name of the new column that stores the value for each group.
You can learn more about this function by calling ?pivot_longer

Use pivot_longer() to reshape your data then plot using ggplot.
Bonus: to add text on the bars, use geom_bar_text from the ggfittext package
library(tidyverse)
DF1 <- read.table(text = "Code Count_2020
A 1
B 2
C 3
D 4
E 5
F 6", header = TRUE)
DF2 <- read.table(text = "Code Count_2021
A 4
B 8
C 6
D 8
E 10
F 12", header = TRUE)
DF <- left_join(DF1, DF2, by = "Code")
# Reshape to one row per Code/Year pair. names_prefix strips "Count_" from
# the former column names directly, so no throw-away helper column has to be
# created by splitting on "_" and then dropped again.
DF_long <- DF %>%
  pivot_longer(
    -Code,
    names_to = "Year",
    names_prefix = "Count_",
    values_to = "Count"
  )
DF_long
#> # A tibble: 12 x 3
#> Code Year Count
#> <chr> <chr> <int>
#> 1 A 2020 1
#> 2 A 2021 4
#> 3 B 2020 2
#> 4 B 2021 8
#> 5 C 2020 3
#> 6 C 2021 6
#> 7 D 2020 4
#> 8 D 2021 8
#> 9 E 2020 5
#> 10 E 2021 10
#> 11 F 2020 6
#> 12 F 2021 12
plt <- ggplot(DF_long, aes(x = Code,
y = Count,
fill = Year)) +
geom_col(position = position_dodge(width = 0.9)) +
theme_minimal()
plt
library(ggfittext)
plt +
geom_bar_text(position = "dodge", reflow = TRUE)
Created on 2021-08-05 by the reprex package (v2.0.1)

I also found another way of doing it:
library(tidyverse)
DF1 <- data.frame(
  Code = c("A", "B", "C", "D", "E", "F"),
  Count_2020 = c(1, 2, 3, 4, 5, 6)
)
DF2 <- data.frame(
  Code = c("A", "B", "C", "D", "E", "F"),
  Count_2021 = c(4, 8, 6, 8, 10, 12)
)

# Explicit join key instead of letting dplyr guess it from the common columns.
DF_Merged <- inner_join(DF1, DF2, by = "Code")

# `-Code` keeps Code out of the gathered columns, so only the two count
# columns are stacked. The year label is derived by stripping the "Count_"
# prefix, which keeps working if more years are added.
DFF_Merged <- DF_Merged %>%
  gather(key = Type, value = Value, -Code) %>%
  mutate(Type = sub("^Count_", "", Type))

DFF_Merged %>%
  ggplot(aes(
    x = reorder(Code, Value), y = Value, fill = Type,
    # NOTE(review): `text` is not drawn by geom_col; presumably it is meant
    # for interactive tooltips (e.g. plotly) -- confirm.
    text = paste(
      "Count:", Value,
      "<br>", "Offense Code:", Code,
      "<br>", "Year:", Type
    )
  )) +
  geom_col(position = "dodge", show.legend = FALSE) +
  xlab("Offense Code") +
  ylab("Count") +
  ggtitle("Arrest Counts for Group 1 in Year 2020 and 2021") +
  theme(axis.text = element_text(size = 8))
Result

Related

R - cleaning data with repeated columns for different locations

#Edited to make my data more similar to the data I'm working with and example of what I have tried
I am working with a Qualtrics survey where blocks of questions repeat themselves based on previous questions using a function in the survey build called "loop and merge". I'm trying to pull out like questions and then use rbind so that each question only shows up once in a column. I have a basic example below, however in my actual data, the repeats happen 36 times.
example data frame:
# Example survey export: the same six questions (q1..q6) repeated under the
# prefixes 1_, 2_ and 3_. check.names = FALSE keeps the digit-leading column
# names exactly as written.
capacity_1 <- data.frame(
  "1_q1" = 1:4,
  "1_q2" = c("a", "b", "c", "d"),
  "1_q3" = 10:13,
  "1_q4" = 100:103,
  "1_q5" = 110:113,
  "1_q6" = 11:14,
  "2_q1" = 22:25,
  "2_q2" = c("i", "j", "k", "l"),
  "2_q3" = 20:23,
  "2_q4" = 200:203,
  "2_q5" = 210:213,
  "2_q6" = 21:24,
  "3_q1" = 90:93,
  "3_q2" = c("p", "q", "r", "s"),
  "3_q3" = 10:13,
  "3_q4" = 300:303,
  "3_q5" = 310:313,
  "3_q6" = 31:34,
  check.names = FALSE
)
note that the "1_" at the start of "1_q1" is the county's reference number
What I could do but that is inefficient, especially since my actual data repeats these questions 36 times:
dat_1 <- dat %>%
select(1:2) %>%
rename(q = 1:2) %>%
mutate("county" = 1)
dat_2 <- dat %>%
select(3:4) %>%
rename(q = 1:2) %>%
mutate("county" = 2)
dat_3 <- dat %>%
select(5:6) %>%
rename(q = 1:2)%>%
mutate("county" = 3)
dat_final <- rbind(dat_1, dat_2, dat_3)
the "dat_final" data frame is what I'd like the data to look like, but also have formatted again here:
dat_clean <- data.frame("q1" = c(1:4, 22:25, 90:93),
"q2" = c("a", "b", "c", "d",
"i", "j", "k", "l",
"p", "q", "r", "s"),
"county" = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3))
Update - I tried the suggestion below and got the error: "Error in `set_names()`: the size of 'nm' (6) must be compatible with the size of 'x' (2)".
# Attempt to stack each block of six columns under common names and tag it
# with a county number derived from the block's starting position.
do.call(
rbind,
lapply(seq(1,ncol(capacity_1),6), \(i) {
capacity_1 %>%
# NOTE: c(i, i+5) selects only two columns (positions i and i+5), while six
# names are supplied to rename_all() below; this is the 6-vs-2 size mismatch
# named in the set_names() error.
select(c(i,i+5)) %>%
rename_all(~c("capacity_outpatient", "capacity_inpatient", "capacity_housing",
"capacity_recovery", "capacity_demand", "capacity_notes")) %>%
mutate(county=(i+5)/6)
})
)

You can do the following, which uses a seq from 1 to ncol(dat), by 2:
# Walk over the first column of every (q1, q2) pair, give each pair the same
# column names, tag it with its county number, and stack the pieces.
do.call(
  rbind,
  lapply(seq(1, ncol(dat), by = 2), \(i) {
    dat %>%
      # all_of() marks the positions as an external vector, so they can never
      # be confused with a column that happens to be called `i`.
      select(all_of(c(i, i + 1))) %>%
      # rename_with() replaces the superseded rename_all().
      rename_with(~ c("q1", "q2")) %>%
      # Pairs start at columns 1, 3, 5, ... which map to county 1, 2, 3, ...
      mutate(county = (i + 1) / 2)
  })
)
Output:
q1 q2 county
1 1 a 1
2 2 b 1
3 3 c 1
4 4 d 1
5 22 i 2
6 23 j 2
7 24 k 2
8 25 l 2
9 90 p 3
10 91 q 3
11 92 r 3
12 93 s 3
Another approach, with data.table
library(data.table)
# Convert `dat` to a data.table by reference.
setDT(dat)
# Take each (q1, q2) column pair, give it common names, and stack the pieces.
# idcol numbers the pieces 1, 2, 3, ... in list order and stores that number
# in a `county` column.
rbindlist(
  lapply(seq(1, ncol(dat), by = 2), \(i) {
    # with = FALSE makes it explicit that j is a vector of column positions.
    setnames(dat[, i:(i + 1), with = FALSE], c("q1", "q2"))
  }),
  use.names = FALSE,
  idcol = "county"
)
Output:
county q1 q2
1: 1 1 a
2: 1 2 b
3: 1 3 c
4: 1 4 d
5: 2 22 i
6: 2 23 j
7: 2 24 k
8: 2 25 l
9: 3 90 p
10: 3 91 q
11: 3 92 r
12: 3 93 s

A solution using dplyr, purrr and stringr. This solution is not affected by the column order or by the number of q columns; it just uses the prefix as the basis for processing the data.
library(dplyr)
library(purrr)
library(stringr)
dat <- data.frame("1_q1" = 1:4,
"1_q2" = c("a", "b", "c", "d"),
"2_q1" = 22:25,
"2_q2" = c("i", "j", "k", "l"),
"3_q1" = 90:93,
"3_q2" = c("p", "q", "r", "s"), check.names = FALSE)
# Here is the indexes of county that want to extract from df
county_index <- c("1", "2", "3")
# Function that take index as input and will extract data from `dat` df
# Extract the columns of the global `dat` that belong to one county.
#
# index: the county prefix as it appears before the underscore in the column
#   names of `dat` (e.g. "1" for "1_q1").
# Returns those columns with the leading "<digits>_" prefix removed from their
# names, plus a numeric `county` column holding `index`.
edit_df <- function(index) {
  prefix <- paste0(index, "_")
  dat %>%
    # starts_with() only matches at the beginning of the name. An unanchored
    # regex match on "1_" would also pick up "11_q1", "21_q1", ... once there
    # are more than nine counties.
    select(starts_with(prefix)) %>%
    # remove the index prefix from the column names
    rename_with(~ str_replace(.x, "^\\d+_", "")) %>%
    # add the county column with the input index
    mutate(county = as.numeric(index))
}
Result using purrr::map_dfr
# map the county index that want to extract from original df and edit_df function
dat_clean <- map_dfr(.x = county_index, .f = edit_df)
dat_clean
#> q1 q2 county
#> 1 1 a 1
#> 2 2 b 1
#> 3 3 c 1
#> 4 4 d 1
#> 5 22 i 2
#> 6 23 j 2
#> 7 24 k 2
#> 8 25 l 2
#> 9 90 p 3
#> 10 91 q 3
#> 11 92 r 3
#> 12 93 s 3
Created on 2022-05-25 by the reprex package (v2.0.1)

How to duplicate rows with a certain condition and create a new variable at the same time

I have a df like the one below and I would like to transform it into something like the table on the right. How can I duplicate the rows with Type == "N" and add the new variable Grade?
Basically, if Type==N, then Grade can be S or W, that is why we need to duplicate the rows.
df<-structure(list(Type = c("N", "N", "S", "W"), Result = c(8, 9,
7, 6)), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))

Using some functions from tidyverse, you can use crossing to duplicate rows and add the "Grade" column at the same time, then filter to match your stated rules.
library(tidyverse)
result <- df %>%
crossing(data.frame(Grade = c('S', 'W'))) %>%
filter(Type == 'N' | Type == Grade)
Type Result Grade
<chr> <dbl> <chr>
1 N 8 S
2 N 8 W
3 N 9 S
4 N 9 W
5 S 7 S
6 W 6 W

I think this approach is extensible to many more conditions assuming yours is the minimal example and you have a larger more complicated dataset.
library(dplyr)
df<-structure(list(Type = c("N", "N", "S", "W"), Result = c(8, 9,
7, 6)), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))
df2 <- data.frame(Type2 = c("N", "N"), Grade = c("S", "W"))
df %>%
select(Type, Result) %>%
left_join(df2, by = c("Type" = "Type2")) %>%
mutate(Grade = case_when(Type == "S" ~ "S", Type == "W" ~ "W", TRUE ~ Grade))
Type Result Grade
<chr> <dbl> <chr>
1 N 8 S
2 N 8 W
3 N 9 S
4 N 9 W
5 S 7 S
6 W 6 W

Another option is to use if_else() (or case_when() if there are more complex conditions) to return a list column of multiple values and unnest:
library(dplyr)
library(tidyr)
df %>%
mutate(Grade = if_else(Type == "N", list(c("S", "W")), as.list(Type))) %>%
unnest(Grade)
# A tibble: 6 x 3
Type Result Grade
<chr> <dbl> <chr>
1 N 8 S
2 N 8 W
3 N 9 S
4 N 9 W
5 S 7 S
6 W 6 W
Or:
df %>%
mutate(Grade = case_when(Type == "N" ~ list(c("S", "W")),
TRUE ~ as.list(Type))) %>%
unnest(Grade)

A dplyr way:
We could use bind_rows after using slice.
library(dplyr)
# Take a copy of the "N" rows (selected by value, not by hard-coded row
# position) and stack it on top of the full data. The .id column records
# which input each row came from: "S" for the copy, "W" for the original.
# That label is exactly the Grade wanted for the "N" rows; every other row
# simply takes its own Type as Grade.
df %>%
  slice(which(Type == "N")) %>%
  bind_rows(S = ., W = df, .id = "Grade") %>%
  mutate(Grade = if_else(Type == "N", Grade, Type)) %>%
  arrange(Type, Result, Grade) %>%
  relocate(Grade, .after = Type)
Type Grade Result
<chr> <chr> <dbl>
1 N S 8
2 N W 8
3 N S 9
4 N W 9
5 S S 7
6 W W 6

Here is a possible data.table option:
library(data.table)
dt <- as.data.table(df)
# For every Result, cross the Type(s) found there with both possible grades,
# then keep all combinations for "N" and only the matching grade otherwise.
# Naming the CJ() columns up front avoids relabelling V1/V2 afterwards, which
# previously put the Type values under "Result" and the Result values under
# "Type".
output <- dt[
  , CJ(Type = Type, Grade = c("S", "W")),
  by = Result
][Type == "N" | Type == Grade]
setcolorder(output, c("Type", "Grade", "Result"))
output
# Output
#>    Type Grade Result
#> 1:    N     S      8
#> 2:    N     W      8
#> 3:    N     S      9
#> 4:    N     W      9
#> 5:    S     S      7
#> 6:    W     W      6

Assign a value to a column in R based on a percentage within each group

[]
I need to create column C in a data frame where 30% of the rows within each group (column B) get a value 0.
How do I do this in R?

We may use rbinom after grouping by the 'category' column, specifying the success probability via prob.
library(dplyr)
# Draw a 0/1 value for every row, independently within each category.
df1 %>%
  group_by(category) %>%
  # A single prob applies to every row. A vector such as c(0.7, 0.3) would be
  # recycled along the rows, alternating the probability of a 1 between 0.7
  # and 0.3 from one row to the next.
  # Each row is 0 with probability 0.3, so the share of zeros per group is
  # 30% in expectation, not exactly.
  mutate(value = rbinom(n(), size = 1, prob = 0.7)) %>%
  ungroup()
-output
# A tibble: 9 x 3
sno category value
<int> <chr> <int>
1 1 A 1
2 2 A 0
3 3 A 1
4 4 B 1
5 5 B 0
6 6 B 1
7 7 C 1
8 8 C 0
9 9 C 0
data
df1 <- structure(list(sno = 1:9, category = c("A", "A", "A", "B", "B",
"B", "C", "C", "C")), class = "data.frame", row.names = c(NA,
-9L))

If your data already exist (assuming this is a simplified answer), and if you want the value to be randomly assigned to each group:
library(dplyr)
d <- data.frame(sno = 1:9,
category = rep(c("A", "B", "C"), each = 3))
# Within each category, build a vector with floor(70%) ones and the remaining
# rows as zeros, then shuffle it so the zeros land on random rows.
d %>%
  group_by(category) %>%
  mutate(
    value = sample(
      rep(c(1, 0), times = c(floor(n() * 0.7), n() - floor(n() * 0.7)))
    )
  ) %>%
  # Drop the grouping so later verbs do not silently operate per category.
  ungroup()

Base R
# Fix the RNG state so the draws below are reproducible.
set.seed(42)
# ave() applies the sampler to each category's slice of the placeholder vector
# and writes the draws back in the original row order.
# Each draw is 0 with probability 0.3 and 1 with probability 0.7.
d$value <- ave(
  numeric(nrow(d)),
  d$category,
  FUN = \(grp) sample(0:1, size = length(grp), prob = c(0.3, 0.7), replace = TRUE)
)
d
# sno category value
# 1 1 A 0
# 2 2 A 0
# 3 3 A 1
# 4 4 B 0
# 5 5 B 1
# 6 6 B 1
# 7 7 C 0
# 8 8 C 1
# 9 9 C 1
Data copied from Brigadeiro's answer:
d <- structure(list(sno = 1:9, category = c("A", "A", "A", "B", "B", "B", "C", "C", "C")), class = "data.frame", row.names = c(NA, -9L))

How can I use purrr to pivot a nested dataframe?

The code below creates a simplified version of the dataframe and illustrates my desired end result (df_wider) based on the unnested version. My question is: How can I achieve the same end result (df_wider) from the nested version (nested_df), using purrr?
library(tidyverse)
df <- tibble(id_01 = c(rep("01", 3), rep("02", 3)),
a = (c("a", "a", "b", "c", "c", "d")),
b = letters[7:12],
id_02 = rep(c(1, 2, 1), 2)
)
df_wider <- pivot_wider(df,
id_cols = c(id_01, a),
names_from = id_02,
values_from = b,
names_sep = "_"
)
nested_df <- nest(df, data = -id_01)
To be clear, I am trying to pivot while the dataframes are nested (i.e., before unnesting).

We can use purrr::map() within dplyr::mutate():
library(tidyverse)
df <- tibble(
id_01 = c(rep("01", 3), rep("02", 3)),
a = (c("a", "a", "b", "c", "c", "d")),
b = letters[7:12],
id_02 = rep(c(1, 2, 1), 2)
)
# Nest everything except id_01, then widen each nested tibble on its own:
# one row per `a`, one column per id_02 value, filled from `b`.
nested_df <- df %>%
  nest(data = -id_01) %>%
  mutate(
    data = map(data, function(tbl) {
      pivot_wider(tbl, id_cols = a, names_from = id_02, values_from = b)
    })
  )
nested_df
#> # A tibble: 2 x 2
#> id_01 data
#> <chr> <list>
#> 1 01 <tibble [2 x 3]>
#> 2 02 <tibble [2 x 3]>
nested_df %>%
unnest(data)
#> # A tibble: 4 x 4
#> id_01 a `1` `2`
#> <chr> <chr> <chr> <chr>
#> 1 01 a g h
#> 2 01 b i <NA>
#> 3 02 c j k
#> 4 02 d l <NA>
Created on 2021-03-26 by the reprex package (v1.0.0)

How do I select column based on value in another column with dplyr?

My data frame looks like this:
id A T C G ref var
1 1 10 15 7 0 A C
2 2 11 9 2 3 A G
3 3 2 31 1 12 T C
I'd like to create two new columns: ref_count and var_count which will have following values:
Value from A column and value from C column, since ref is A and var is C
Value from A column and value from G column, since ref is A and var is G
etc.
So I'd like to select a column based on the value in another column for each row.
Thanks!

We can use pivot_longer to reshape into 'long' format, filter the rows and then reshape it to 'wide' format with pivot_wider
library(dplyr)
library(tidyr)
# Long format: one row per (id, base) with the base letter in `name` and its
# count in `value`; ref and var are repeated on every row of the id.
long_counts <- df1 %>%
  pivot_longer(cols = A:G)

# Filter the ref and var rows separately and label each by the condition that
# selected it. Labelling by row position instead would swap the two counts
# whenever the var column comes before the ref column, and would fail when
# ref and var name the same column.
bind_rows(
  long_counts %>% filter(name == ref) %>% mutate(nm1 = "ref_count"),
  long_counts %>% filter(name == var) %>% mutate(nm1 = "var_count")
) %>%
  select(id, value, nm1) %>%
  pivot_wider(names_from = nm1, values_from = value) %>%
  left_join(df1, ., by = "id")
# A tibble: 3 x 9
# id A T C G ref var ref_count var_count
#* <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#1 1 10 15 7 0 A C 10 7
#2 2 11 9 2 3 A G 11 3
#3 3 2 31 1 12 T C 31 1
Or in base R, we can also make use of the vectorized row/column indexing
# Vectorised (row, column) matrix indexing: for every row, pick the count
# column named by `ref` (or `var`).
# The count columns are addressed by name, so adding the new columns to df1
# cannot shift what is being indexed.
count_cols <- c("A", "T", "C", "G")
counts <- as.matrix(df1[count_cols])
row_idx <- seq_len(nrow(df1))
df1$ref_count <- counts[cbind(row_idx, match(df1$ref, count_cols))]
df1$var_count <- counts[cbind(row_idx, match(df1$var, count_cols))]
data
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))

The following is a tidyverse alternative without creating a long dataframe that needs filtering. It essentially uses tidyr::nest() to nest the dataframe by rows, after which the correct column can be selected for each row.
library(tidyverse)
# Nest everything except id so each nested tibble holds one row, then look up
# the columns named by that row's ref and var.
# The new columns are called ref_count / var_count, as asked for in the
# question.
df1 %>%
  nest(data = -id) %>%
  mutate(
    data = map(
      data,
      \(row) mutate(row, ref_count = row[[ref]], var_count = row[[var]])
    )
  ) %>%
  unnest(data)
#> # A tibble: 3 × 9
#>      id     A     T     C     G ref   var   ref_count var_count
#>   <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr>     <dbl>     <dbl>
#> 1     1    10    15     7     0 A     C            10         7
#> 2     2    11     9     2     3 A     G            11         3
#> 3     3     2    31     1    12 T     C            31         1
A variant of this does not need the (assumed row-specific) id column but defines the nested groups from the unique values of ref and var directly:
# Nest by the unique (ref, var) combinations instead of by id; every nested
# tibble then shares one ref and one var, which are passed to the helper as
# plain strings.
# The new columns are called ref_count / var_count, as asked for in the
# question.
df1 %>%
  nest(data = -c(ref, var)) %>%
  mutate(
    data = pmap(
      list(data, ref, var),
      function(df, ref, var) {
        mutate(df, ref_count = df[[ref]], var_count = df[[var]])
      }
    )
  ) %>%
  unnest(data)
The data were specified by akrun:
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

geom_bar two datasets together in R - r

Related

R - cleaning data with repeated columns for different locations

How to duplicate rows with a certain condition and create a new variable at the same time

Assign a value to a column in R based on a percentage within each group

How can I use purrr to pivot a nested dataframe?

How do I select column based on value in another column with dplyr?

Categories

Resources