Plotting a scatter plot in R - r

I have a data structure that I got as a result of the problem stated here.
Code:
df <- tibble::tribble(~person, ~age, ~height,
"John", 1, 20,
"Mike", 3, 50,
"Maria", 3, 52,
"Elena", 6, 90,
"Biden", 9, 120)
df %>%
mutate(
age_c = cut(
age,
breaks = c(-Inf, 5, 10),
labels = c("0-5", "5-10"),
right = TRUE
),
height_c = cut(
height,
breaks = c(-Inf, 50, 100, 200),
labels = c("0-50", "50-100", "100-200"),
right = TRUE
)
) %>%
count(age_c, height_c, .drop = FALSE)
# A tibble: 6 x 3
age_c height_c n
<fct> <fct> <int>
1 0-5 0-50 2
2 0-5 50-100 1
3 0-5 100-200 0
4 5-10 0-50 0
5 5-10 50-100 1
6 5-10 100-200 1
Now I am trying to create a scatter plot but I have a problem that it seems like the code is not noticing that the values on the X and Y axis are repeating. Instead, it is repeating them. So, I would expect my x-axis to have two values 0-5 and 5-10 (what I get is 0-5,0-5,0-5,5-10,5-10,5-10), and the y-axis three values 0-50, 50-100 and 100-200 (instead I have two series of them).
The code I use to plot:
ggplot(df, aes(x=age_c, y=height_c))
Expected plot (where the size of circles would be based on the value of N):

If you plot the count data.frame it should work:
countdf = df %>%
mutate(
age_c = cut(
age,
breaks = c(-Inf, 5, 10),
labels = c("0-5", "5-10"),
right = TRUE
),
height_c = cut(
height,
breaks = c(-Inf, 50, 100, 200),
labels = c("0-50", "50-100", "100-200"),
right = TRUE
)
) %>%
count(age_c, height_c, .drop = FALSE)
countdf %>%
filter(n>0) %>%
ggplot(aes(x=age_c,y=height_c,size=n)) +
geom_point() +
scale_size_continuous(range=c(5,10),breaks=c(1,2))

Related

How to easily generate/simulate example data with different groups for modelling

How to easily generate/simulate meaningful example data for modelling: e.g. telling that give me n rows of data, for 2 groups, their sex distributions and mean age should differ by X and Y units, respectively? Is there a simple way for doing it automatically? Any packages?
For example, what would be the simplest way for generating such data?
groups: two groups: A, B
sex: different sex distributions: A 30%, B 70%
age: different mean ages: A 50, B 70
PS! Tidyverse solutions are especially welcome.
My best try so far is still quite a lot of code:
n=100
d = bind_rows(
#group A females
tibble(group = rep("A"),
sex = rep("Female"),
age = rnorm(n*0.4, 50, 4)),
#group B females
tibble(group = rep("B"),
sex = rep("Female"),
age = rnorm(n*0.3, 45, 4)),
#group A males
tibble(group = rep("A"),
sex = rep("Male"),
age = rnorm(n*0.20, 60, 6)),
#group B males
tibble(group = rep("B"),
sex = rep("Male"),
age = rnorm(n*0.10, 55, 4)))
d %>% group_by(group, sex) %>%
summarise(n = n(),
mean_age = mean(age))
There are lots of ways to sample from vectors and to draw from random distributions in R. For example, the data set you requested could be created like this:
set.seed(69) # Makes samples reproducible
df <- data.frame(groups = rep(c("A", "B"), each = 100),
sex = c(sample(c("M", "F"), 100, TRUE, prob = c(0.3, 0.7)),
sample(c("M", "F"), 100, TRUE, prob = c(0.5, 0.5))),
age = c(runif(100, 25, 75), runif(100, 50, 90)))
And we can use the tidyverse to show it does what was expected:
library(dplyr)
df %>%
group_by(groups) %>%
summarise(age = mean(age),
percent_male = length(which(sex == "M")))
#> # A tibble: 2 x 3
#> groups age percent_male
#> <chr> <dbl> <int>
#> 1 A 49.4 29
#> 2 B 71.0 50

How do I just calculate the average for a specific set of rows in Rstudio

DF <- data.frame(Height = rnorm(100, 170, 5),
Weight = rnorm(100, 55, 5))
BMI = function(height,weight){(weight/(height)^2*10000)}
DF$bmi = BMI(DF$Height,DF$Weight)
DF$weight_group <-
cut(
x = DF$Weight,
breaks = c(0,60,70,Inf),
include.lowest = TRUE,
labels = c("0-60", "61-70", "71+")
)
DF$BMI_group <-
cut(
x = DF$bmi,
breaks = c(0, 19.75582, Inf),
include.lowest = TRUE,
labels = c("Below Average", "Above Average")
)
This is my code. I cannot figure out how to just calculate the average of the last half of the data frame. I didn't know how to add in gender, to make 50 males and 50 females, so this is my work around.
Are you looking for such a solution?
DF <- data.frame(Height = rnorm(100, 170, 5),
Weight = rnorm(100, 55, 5),
Gender = c(rep("male", 50), rep("female", 50)))
BMI <- function(height,weight){(weight/(height)^2*10000)}
library(dplyr)
DF %>%
group_by(Gender) %>%
mutate(bmi = BMI(Height, Weight)) %>%
summarise(mean_bmi = mean(bmi))
# A tibble: 2 x 2
Gender mean_bmi
* <chr> <dbl>
1 female 19.4
2 male 19.6
We may use sample to create the column, subset the 'Gender' for 'F', and apply the BMI
DF$Gender <- sample(c("F", "M"), nrow(DF), replace = TRUE, prob = c(0.5, 0.5))
with(subset(DF, Gender == "F"), mean(BMI(Height, Weight)))
If we want to get the mean of 'BMI' by 'BMI_group'
subdf <- subset(DF, Gender == "F")
with(subdf, tapply(BMI(Height, Weight), BMI_group, FUN = mean))
Below Average Above Average
17.57841 21.43003

How to add values ​between hours

I have code that breaks down hours with corresponding values ​​into quarters of an hour.
Unfortunately, when broken down into quarters of an hour, the values ​​are identical for the entire hour.
After adding quarters of an hour, I would also like to add values ​​between the original hours so that the graph is smooth and not sharp. How to do it, average it, interpolate it?
df <- data.frame(
h = 0:23,
x = c(22, 11, 5, 8 , 22, 88, 77, 7, 11, 5, 8 , 22, 88, 77, 11, 5, 8 , 22, 88, 77, 11, 5, 8 , 22))
library(dplyr)
library(stringr)
df %>%
data.frame(h = rep(df$h, each = 4), # quadruplicate rows
x = rep(df$x, each = 4)) %>% # quadruplicate rows
mutate(h.1 = str_pad(h.1, width = 2, side = "left", pad = "0"), # add leading '0'
qu = paste0(h.1, c(":00", ":15", ":30", ":45"))) %>% # create quarters
select( - c(h,x)) %>% # deselect obsolete cols
rename(c("h" = "h.1", "x" = "x.1"))
df %>%
ggplot() +
geom_point(aes(qu, x), color = "red", size = 2) +
labs(x= "", y = "",
title = "Example")
Here I make a "decimal hour" variable to simplify the calculations. We can also use hms::hms() to define a timestamp that ggplot2 can understand. I use base:approx here to interpolate between hourly points.
df2 <- df %>%
tidyr::uncount(4) %>% # make 4 copies of each row
mutate(h_dec = h + (0:3)/4,
h_time = hms::hms(hours = h_dec),
x = x * c(1, NA, NA, NA), # this is to make non-hourly into NA,
# so that approx only uses hourly
x_interp = approx(x = h, y = x, xout=h_dec)$y)
df2 %>%
ggplot() +
geom_point(aes(h_time, x_interp), color = "red", size = 2) +
labs(x= "", y = "",
title = "Example")

R: Tidy Aggregation of Sequence Data and Visualization of Stepfunctions

I have some patient data, where the individual patients change treatment groups over time. My goal is to visualize the sequence of group changes and aggregate this data into a "sequence profile" for each treatment group.
For each treatment group I would like to show, when it generally occurs
in the treatment cycle (say rather in the beginning or in the end). To account for the differing sequence length, I would like to standardize these profiles betweenn 0 (very beginning) and 1 (end).
I would like to find an efficient data preparation and visualization.
Mininmal Example
Structure of Data
library(dplyr)
library(purrr)
library(ggplot2)
# minimal data
cj_df_raw <- tibble::tribble(
~id, ~group,
1, "A",
1, "B",
2, "A",
2, "B",
2, "A"
)
# compute "intervals" for each person [start, end]
cj_df_raw %>%
group_by(id) %>%
mutate(pos = row_number(),
len = length(id),
start = (pos - 1) / len,
end = pos / len) %>%
filter(group == "A")
#> # A tibble: 3 x 6
#> # Groups: id [2]
#> id group pos len start end
#> <dbl> <chr> <int> <int> <dbl> <dbl>
#> 1 1 A 1 2 0 0.5
#> 2 2 A 1 3 0 0.333
#> 3 2 A 3 3 0.667 1
(So Id 1 was in group A in the first 50% of their sequence, and Id 2 was in Group A in the first 33% and the last 33% of their sequence. This means, that 2 Ids where between 0-33% of the sequence, 1 between 33-50%, 0 between 50-66% and 1 above 66%.)
This is the result I would like to achieve and I miss a chance to transform my data effectively.
Desired outcome
profile_treatmen_a <- tibble::tribble(
~x, ~y,
0, 0L,
0.33, 2L,
0.5, 1L,
0.66, 0L,
1, 1L,
1, 0L
)
profile_treatmen_a %>%
ggplot(aes(x, y)) +
geom_step(direction = "vh") +
expand_limits(x = c(0, 1), y = 0)
(Ideally the area under the curve would be shaded)
Ideal solution: via ggridges
The goal of the visualization would be to compare the "sequence-profile" of many treatment-groups at the same time. If I could prepare the data accordingly, I would like to use the ggridges-package for a striking visual comparison the treatment groups.
library(ggridges)
data.frame(group = rep(letters[1:2], each=20),
mean = rep(2, each=20)) %>%
mutate(count = runif(nrow(.))) %>%
ggplot(aes(x=count, y=group, fill=group)) +
geom_ridgeline(stat="binline", binwidth=0.5, scale=0.9)
You could build helper intervals and then just plot a histogram. Since each patient is either in Group A or B both groups sum up to 100%. With these helper intervals you could also easily switch to other geoms.
library(tidyverse, warn.conflicts = FALSE)
library(ggplot2)
# create sample data
set.seed(42)
id <- 1:10 %>% map(~ rep(x = .x, times = runif(n = 1, min = 1, max = 6))) %>%
unlist()
group <- sample(x = c("A", "B"), size = length(id), replace = TRUE) %>%
as_factor()
df <- tibble(id, group)
glimpse(df)
#> Observations: 37
#> Variables: 2
#> $ id <int> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 5, 5,...
#> $ group <fct> A, B, B, A, A, B, B, A, A, B, B, A, B, B, A, B, A, B, A,...
# tidy data
df <- df %>%
group_by(id) %>%
mutate(from = (row_number() - 1) / n(),
to = row_number() / n()) %>%
ungroup() %>%
rowwise() %>%
mutate(list = seq(from + 1/60, to, 1/60) %>% list()) %>%
unnest()
# plot
df %>%
ggplot(aes(x = list, fill = group)) +
geom_histogram(binwidth = 1/60) +
ggthemes::theme_hc()
Created on 2018-09-16 by the [reprex package](http://reprex.tidyverse.org) (v0.2.0).
My attempt at an answer.. although it is probably not the nicest/fastest/most efficient way, I think it might help you in your efforts.
library(data.table)
# compute "intervals" for each person [start, end]
df <- cj_df_raw %>%
group_by(id) %>%
mutate(pos = row_number(),
len = length(id),
from = (pos - 1) / len,
to = pos / len,
value = 1)
dt <- as.data.table(df)
setkey(dt, from, to)
#create intervals
dt.interval <- data.table(from = seq( from = 0, by = 0.01, length.out = 100),
to = seq( from = 0.01, by = 0.01, length.out = 100))
#perform overlap join on intervals
dt2 <- foverlaps( dt.interval, dt, type = "within", nomatch = NA)[, sum(value), by = c("i.from", "group")]
#some melting ans casting to fill in '0' on empty intervals
dt3 <- melt( dcast(dt2, ... ~ group, fill = 0), id.vars = 1 )
#plot
ggplot( dt3 ) +
geom_step( aes( x = i.from, y = value, color = variable ) ) +
facet_grid( .~variable )

echarts4r moving the plot down to make room for the legend

Is there a way to move the plot down so that there is some space between the legend and the plot area? Ideally have the chart area automatically spaced below the legend.
df <- data.frame(
x = seq(50),
y = rnorm(50, 10, 3),
z = rnorm(50, 11, 2),
w = rnorm(50, 9, 2)
)
df %>%
e_charts(x) %>%
e_line(w) %>%
e_line(y) %>%
e_line(z) %>%
e_legend(orient = 'vertical', left = 0, top = 0)
Use the e_grid function to adjust the "grid" on which the graph is plotted.
library(echarts4r)
df <- data.frame(
x = seq(50),
y = rnorm(50, 10, 3),
z = rnorm(50, 11, 2),
w = rnorm(50, 9, 2)
)
df %>%
e_charts(x) %>%
e_line(w) %>%
e_line(y) %>%
e_line(z) %>%
e_legend(
orient = 'vertical',
left = 0,
top = 0,
selectedMode = "single" # might be of use
) %>%
e_grid(left = 100, top = 5)
Plenty more options in the grid can be found here

Resources