I have a df that looks like this:
Sample data can be built using this code:
# Sample data: six rows for ID 101, each with an event name (AEDECOD) and
# start/end days (AESTDY/AEENDY) stored as difftime in days; three of the end
# days are NA.
df<-structure(list(ID = c(101, 101, 101, 101, 101, 101), AEDECOD = c("Diarrhoea",
"Vitreous floaters", "Musculoskeletal pain", "Diarrhoea", "Decreased appetite",
"Fatigue"), AESTDY = structure(c(101, 74, 65, 2, 33, 27), class = "difftime", units = "days"),
AEENDY = structure(c(105, 99, NA, 5, NA, NA), class = "difftime", units = "days")), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
I would like to make a plot that looks like following:
Sorry for the blurry plot. This is the closest one that I could find. Could someone give me some guidance on how to make such a plot?
Thanks.
With ggplot2, using Unicode's black "right-pointing pointer" (U+25BA) and "left-pointing pointer" (U+25C4) characters for the start and end markers, respectively.
# One horizontal segment per event from its start day to its end day, with a
# right-pointing glyph at the start and a left-pointing glyph at the end.
# Rows whose end day is NA get neither an end glyph nor a segment.
ggplot(df, aes(y = AEDECOD)) +
  geom_point(aes(x = AESTDY), shape = "\u25BA") +
  geom_point(aes(x = AEENDY), shape = "\u25C4") +
  geom_segment(aes(x = AESTDY, xend = AEENDY, yend = AEDECOD))
This might get you started.
There are issues about what to do with or how to interpret NAs - this approach just ignores them - you do not get a line.
Start days are indicated by a point.
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
# Convert the difftime day columns to plain numbers. as.numeric() on a difftime
# keeps the sign and any fractional part; extracting "\\d+" from the printed
# value would silently drop both (e.g. a start day of -3 would become 3).
df1 <- df %>%
  mutate(across(ends_with("DY"), ~ as.numeric(.x, units = "days")))

# Events with a known end day are drawn as segments (rows with a missing end
# day are dropped by geom_segment with a warning); events whose end day is
# missing are drawn as a single point at their start day instead.
ggplot(df1, aes(x = AESTDY, y = AEDECOD)) +
  geom_segment(aes(xend = AEENDY, yend = AEDECOD)) +
  geom_point(data = filter(df1, is.na(AEENDY)))
#> Warning: Removed 3 rows containing missing values (geom_segment).
Created on 2021-04-12 by the reprex package (v2.0.0)
Related
Using a dataframe with missing values:
structure(list(id = c("id1", "test", "rew", "ewt"), total_frq_1 = c(54, 87, 10, 36), total_frq_2 = c(45, 24, 202, 43), total_frq_3 = c(24, NA, 25, 8), total_frq_4 = c(36, NA, 104, NA)), row.names = c(NA, 4L), class = "data.frame")
How is it possible to create a bar plot with the mean of every column (excluding the id column), not by filling the missing values with 0 but by leaving out the rows with missing values? For example, for total_frq_3: (24 + 25 + 8) / 3 = 57 / 3 = 19.
You can use colMeans function and pass it the appropriate argument to ignore NA.
library(ggplot2)
# Example data: an id column followed by four numeric columns, two of which
# contain missing values.
xy <- structure(
  list(
    id = c("id1", "test", "rew", "ewt"),
    total_frq_1 = c(54, 87, 10, 36),
    total_frq_2 = c(45, 24, 202, 43),
    total_frq_3 = c(24, NA, 25, 8),
    total_frq_4 = c(36, NA, 104, NA)
  ),
  row.names = c(NA, 4L),
  class = "data.frame"
)

# Mean of every column except the leading id column, skipping NAs, stored as a
# one-column data frame whose row names are the original column names.
xy.means <- data.frame(xy.means = colMeans(xy[, -1], na.rm = TRUE))
# Copy the column names into a regular column so they can be mapped to x.
xy.means$total <- row.names(xy.means)

# One bar per original column; bar height is that column's mean.
ggplot(xy.means, aes(x = total, y = xy.means)) +
  geom_col() +
  theme_bw()
Or just use base R graphics:
# Same chart with base graphics: one bar per column mean (NAs skipped), with
# the id column excluded.
barplot(colMeans(xy[, -1], na.rm = TRUE))
I hope I asked my question in the right way this time! If not let me know!
I want to code a grouped bar chart similar to this one (which I just created in Paint):
enter image description here
I drew it flipped, but it actually doesn't matter whether it is flipped or not. So a plot similar to this one would also be very useful:
Grouped barchart in r with 4 variables
Both the variables, happy and lifesatisfied are scaled values from 0 to 10. Working hours is a grouped value and contains 43+, 37-42, 33-36, 27-32, and <27.
A very similar example of how my data set looks like (I just changed the values and order, I also have much more observations):
Working hours
happy
lifesatisfied
country
37-42
7
9
DK
<27
8
8
SE
43+
7
8
DK
33-36
6
6
SE
37-42
7
5
NO
<27
4
7
NO
I tried to find similar examples and, based on those, tried to code the bar chart in the following way, but it doesn't work:
# Attempt 1: stack the happy and stflife columns into long format (column name
# in `var`, raw value in `Percentage`).
df2 <- datafilteredwomen %>%
pivot_longer(cols = c("happy", "stflife"), names_to = "var", values_to = "Percentage")
# Dodged bars with the raw values on x and the working-hours bins on y.
# NOTE(review): `Percentage` holds the unaggregated scores here, not percentages.
ggplot(df2) +
geom_bar(aes(x = Percentage, y = workinghours, fill = var ), stat = "identity", position = "dodge") + theme_minimal()
It give this plot which is not correct/what I want:
enter image description here
Second try:
# Attempt 2: count rows per (workinghours, happy, stflife) combination.
# summarise() drops the last grouping level, so `proportion` is each count's
# share within its (workinghours, happy) group.
forplot = datafilteredwomen %>% group_by(workinghours, happy, stflife) %>% summarise(count = n()) %>% mutate(proportion = count/sum(count))
# Dodged bars of those proportions per working-hours bin, filled by happy score.
ggplot(forplot, aes(workinghours, proportion, fill = as.factor(happy))) +
geom_bar(position = "dodge", stat = "identity", color = "black")
gives this plot:
enter image description here
third try - used the ggplot2 builder add-in:
library(dplyr)
library(ggplot2)
# Attempt 3: drop rows without a working-hours bin, then draw dodged bars.
# With `weight = happy`, each bar's height is the sum of the happy scores in
# its (workinghours, happy) group rather than a row count.
datafilteredwomen %>%
filter(!is.na(workinghours)) %>%
ggplot() +
aes(x = workinghours, group = happy, weight = happy) +
geom_bar(position = "dodge",
fill = "#112446") +
# The percent labels only reformat the axis; the values are not rescaled.
theme_classic() + scale_y_continuous(labels = scales::percent)
gives this plot:
enter image description here
But none of my tries are what I want.. really hope that someone can help me if it's possible!
After speaking to the OP I found his data source and came up with this solution. Apologies if it's a bit messy, I have only been using R for 6 months. For ease of reproducibility I have preselected the variables used from the original dataset.
# Sample of 20 rows with three numeric columns: wkhtot (working hours, later
# binned into ranges), happy and stflife (scores; values here range 5-10).
data <- structure(list(wkhtot = c(40, 8, 50, 40, 40, 50, 39, 48, 45,
16, 45, 45, 52, 45, 50, 37, 50, 7, 37, 36), happy = c(7, 8, 10,
10, 7, 7, 7, 6, 8, 10, 8, 10, 9, 6, 9, 9, 8, 8, 9, 7), stflife = c(8,
8, 10, 10, 7, 7, 8, 6, 8, 10, 9, 10, 9, 5, 9, 9, 8, 8, 7, 7)), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
Here are the packages required.
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later with a less clear message.
library(dplyr)
library(ggplot2)
library(tidyverse) # also attaches tidyr, whose pivot_longer() is used below
Here I have manipulated the data and commented my reasoning.
# Reshape the survey sample to one row per respondent and criterion, with the
# working hours binned into the ranges <27, 27-32, 33-36, 37-42 and 43+.
data <- data %>%
  # Keep the wanted variables, renaming the two scores for display.
  select(wkhtot, Happy = happy, `Life Satisfied` = stflife) %>%
  na.omit() %>% # remove rows with NA values
  mutate(
    # Left-closed bins ([27, 33), [33, 37), ...) so that exactly 27 hours falls
    # into "27-32"; right-closed breaks at 27 would put it into "<27".
    WorkingHours = cut(
      wkhtot,
      breaks = c(-Inf, 27, 33, 37, 43, Inf),
      labels = c("<27", "27-32", "33-36", "37-42", "43+"),
      right = FALSE
    )
  ) %>%
  select(-wkhtot) %>% # the raw hours are no longer needed once binned
  # Pivot longer for plotting: criterion name in `Criterion`, value in `score`.
  pivot_longer(
    cols = c(Happy, `Life Satisfied`),
    names_to = "Criterion",
    values_to = "score"
  ) %>%
  mutate(Criterion = as.factor(Criterion)) # factor for graphical reasons
A bit more data prep
# Creating the percentage
# Share of each criterion's summed score within every working-hours bin.
data.plot <- data %>%
  group_by(WorkingHours, Criterion) %>%
  # Sum of the scores per working-hours bin and criterion. Naming the column
  # explicitly avoids summing any unrelated column that may be present.
  summarise(score = sum(score), .groups = "drop") %>%
  group_by(WorkingHours) %>%
  mutate(
    tot = sum(score), # combined score of all criteria within the bin
    freq = round(score / tot * 100, digits = 2) # criterion's share in percent
  ) %>%
  ungroup() # do not leak the grouping into later steps
Creating the plot.
# Plotting
# Dodged bars: one bar per criterion within each working-hours bin, labelled
# with its percentage (vjust = 1 places the label's top edge at the bar top).
ggplot(data.plot) +
  aes(x = WorkingHours, y = freq, fill = Criterion) +
  geom_col(position = position_dodge()) +
  geom_text(
    mapping = aes(label = freq),
    vjust = 1,
    position = position_dodge(width = 0.9)
  ) +
  labs(x = "Working Hours", y = "Percentage")
Please let me know if there is a more concise or easier way!!
B
DataSource: https://www.europeansocialsurvey.org/downloadwizard/?fbclid=IwAR2aVr3kuqOoy4mqa978yEM1sPEzOaghzCrLCHcsc5gmYkdAyYvGPJMdRp4
Taking this example dataframe df:
# Example data in long format: one row per Working.hours / country / criterion
# combination, with the corresponding value in `score`.
df <- structure(list(Working.hours = c("37-42", "37-42", "<27", "<27",
"43+", "43+", "33-36", "33-36", "37-42", "37-42", "<27", "<27"
), country = c("DK", "DK", "SE", "SE", "DK", "DK", "SE", "SE",
"NO", "NO", "NO", "NO"), criterion = c("happy", "lifesatisfied",
"happy", "lifesatisfied", "happy", "lifesatisfied", "happy",
"lifesatisfied", "happy", "lifesatisfied", "happy", "lifesatisfied"
), score = c(7L, 9L, 8L, 8L, 7L, 8L, 6L, 6L, 7L, 5L, 4L, 7L)), row.names = c(NA,
-12L), class = c("tbl_df", "tbl", "data.frame"))
you can proceed like this:
library(dplyr)
library(ggplot2)
# The example `df` is already in long format (columns Working.hours, country,
# criterion, score), so it is plotted directly. Calling pivot_longer() on the
# columns `happy` and `lifesatisfied` would fail here because they do not exist.
# If your own data is still wide (one column per criterion), reshape it first:
#   tidyr::pivot_longer(df, cols = c(happy, lifesatisfied),
#                       names_to = "criterion", values_to = "score")
df %>%
  ggplot(aes(x = Working.hours, y = score, fill = criterion)) +
  geom_col(position = "dodge") + # bars of the two criteria side by side
  coord_flip() # horizontal bars, as in the sketch
For picking colours see ?scale_fill_manual, for formatting legend etc. numerous existing answers to related questions on stackoverflow.
I'm trying to come up with a way to visualize some likert scale data in a specific way. I'm not even sure how to fake what it would look like, so I hope my explanation would suffice.
I have the following (fake) data:
n questions, each with 5 answers (strongly agree, agree, don't agree, strongly don't agree, and don't know)
I want to visualize the data (ideally using ggplot) along a central axis, so that the two "agree" answers are on the left, and the two "disagree" answers are on the right, and then on a separate block off to the side, a block representing "don't know". It should look roughly like this:
Q1: *****++++++++|------!! ?????
Q2: ****++++++|----!!!!!! ???????
Q3: **++++++|---!!! ??????????
*: strongly agree, +: agree, -: don't agree, !:strongly disagree, ?: don't know
As you can see, this representation allows to compare the actual numbers of agree and disagree, without hiding how many "don't knows" there are. The place I get stuck on is how to create that second element for the don't knows. Any ideas?
Here's some fake data:
structure(list(Q = structure(1:3, .Label = c("Q1", "Q2", "Q3"
), class = "factor"), SA = c(25, 18, 12), A = c(30, 25, 15),
DA = c(25, 20, 25), SDA = c(10, 18, 25), DK = c(10, 19, 23
)), row.names = c(NA, -3L), class = "data.frame")
As suggested in the comments, you can just facet out the "DK" category.
library(ggplot2)
library(tidyr)
library(scales)
# Fake Likert data: one row per question (Q) with a value for each answer
# category (SA, A, DA, SDA, DK).
df <- structure(list(Q = structure(1:3, .Label = c("Q1", "Q2", "Q3"
), class = "factor"), SA = c(25, 18, 12), A = c(30, 25, 15),
DA = c(25, 20, 25), SDA = c(10, 18, 25), DK = c(10, 19, 23
)), row.names = c(NA, -3L), class = "data.frame")
# Answer categories in the order used to group the stacked bars
# (SA, A, SDA, DA, DK).
answer_cols <- names(df)[-1]
lvls <- answer_cols[c(1, 2, 4, 3, 5)]

# Long format: one row per question and answer category (`name`, `value`).
answers_long <- pivot_longer(df, cols = -1)

# Four viridis colours for the agree/disagree categories, grey for "DK".
fill_colours <- c(viridis_pal()(4), "grey50")

ggplot(
  answers_long,
  aes(
    # Agree answers get negative values so they extend left of zero.
    x = ifelse(name %in% c("A", "SA"), -1, 1) * value,
    y = Q,
    fill = name,
    group = factor(name, lvls)
  )
) +
  geom_col() +
  # "DK" gets its own panel with an independent, proportionally sized x axis.
  facet_grid(~ name == "DK", scales = "free_x", space = "free_x") +
  scale_fill_manual(values = fill_colours, limits = answer_cols)
Created on 2021-11-04 by the reprex package (v2.0.1)
I am using the package growthcurver to create a graph with a sigmoidal curve. It needs to have a logarithmic scale on the y axis.
This works to create a graph without a log scale:
# install.packages("growthcurver")
library("growthcurver")
# Summarise the growth data (time vs. biomass) with growthcurver.
gcfit <- SummarizeGrowth(curveA$time, curveA$biomass)
# Print the resulting object.
gcfit
# Default plot of the result (linear y axis).
plot(gcfit)
I have tried plot(gcfit, log=y) and plot(gcfit, log="curveA$biomass"). This gives me the error
'Non-numeric argument to mathematical function'.
Could it be that I am using a data frame? How do I get around this?
dput(curveA)
structure(list(time = c(1, 2, 3, 4, 5, 6, 7, 8, 17, 18, 19, 20,
21, 22, 23, 24, 25), biomass = c(0.153333333, 1.303333333, 2.836666667,
4.6, 6.21, 6.746666667, 7.283333333, 7.973333333, 8.663333333,
9.046666667, 10.19666667, 10.50333333, 11.04, 11.88333333, 11.96,
11.96, 9.966666667)), class = c("spec_tbl_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -17L), spec = structure(list(
cols = list(time = structure(list(), class = c("collector_number",
"collector")), biomass = structure(list(), class = c("collector_number",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
Without the data that you are using it is difficult to reproduce the problem (how to produce reproducible example: How to make a great R reproducible example), but the easiest solution to your problem might be using log function directly on the data...
# Summarise the growth of the log10-transformed biomass instead of the raw
# values, then draw the default plot of that result.
gcfit_log <- with(curveA, SummarizeGrowth(time, log10(biomass)))
plot(gcfit_log)
The workaround is to extract the data from the model and plot using ggplot2 package:
library(ggplot2)
library(dplyr)
gcfit <- SummarizeGrowth(curveA$time, curveA$biomass)

# Observed values as stored in the returned object.
points_data <- bind_cols(t = gcfit$data$t, n = gcfit$data$N)

# 30 evenly spaced time points from max(t) / 30 up to max(t), at which the
# curve is evaluated with the k, n0 and r values taken from gcfit$vals.
t_grid <- max(gcfit$data$t) * (1:30) / 30
line_fit <- bind_cols(
  x = t_grid,
  y = NAtT(gcfit$vals$k, gcfit$vals$n0, gcfit$vals$r, t_grid)
)

# Observations (points joined by a line) and the evaluated curve (red), both
# shifted by +1 and drawn on a log10 y axis.
ggplot(points_data, aes(t, n + 1)) +
  geom_point() +
  geom_line() +
  geom_line(aes(x, y + 1), data = line_fit, colour = "red") +
  scale_y_log10()
I have a panel (cross-sectional time series) dataset. For each group (defined by (NAICS2, occ_type) in time ym) I have many variables. For each variable I would like to subtract each group's first (dplyr::first) value from every value of that group.
Ultimately I am trying to compute the Euclidean distance between each row's vector and its group's first entry, i.e. sqrt(c_1^2 + ... + c_k^2), where c_j is the difference in the j-th variable.
I was able to create a column equal to the first entry for each group:
# Keep one row per (ym, NAICS2, occ_type), keep the grouping columns plus the
# *_scf variables, and add a `<variable>_first` copy of each variable holding
# the first value within its group.
# NOTE(review): the grouping includes ym and rows are made distinct on the same
# three columns, so each group holds a single row and `first` returns that
# row's own value.
df2 <- df %>%
group_by(ym, NAICS2, occ_type) %>%
distinct(ym, NAICS2, occ_type, .keep_all = T) %>%
arrange(occ_type, NAICS2, ym) %>%
select(group_cols(), ends_with("_scf")) %>%
mutate_at(vars(-group_cols(), ends_with("_scf")),
list(first = dplyr::first))
I then tried to include variations of f.diff = . - dplyr::first(.) in the list, but none of those worked. I googled the dot notation for a while as well as first and lag in dplyr timeseries but have not been able to resolve this yet.
Ideally, I unite all variables into a vector for each row first and then take the difference.
# Sketch only (not runnable as written: `???` is a placeholder). Same row
# selection as before, then all non-grouping columns are pasted into a single
# comma-separated text column named `vector`.
df2 <- df %>%
group_by(ym, NAICS2, occ_type) %>%
distinct(ym, NAICS2, occ_type, .keep_all = T) %>%
arrange(occ_type, NAICS2, ym) %>%
select(group_cols(), ends_with("_scf")) %>%
unite(vector, c(-group_cols(), ends_with("_scf")), sep = ',') %>%
# TODO: DISTANCE_BETWEEN_ENTRY_AND_FIRST
mutate(vector.diff = ???)
I expect the output to be a numeric column that contains a distance measure of how different each group's row vector is from its initial row vector.
Here is a sample of the data:
structure(list(ym = c("2007-01-01", "2007-02-01"), NAICS2 = c(0L,
0L), occ_type = c("is_middle_manager", "is_middle_manager"),
Administration_scf = c(344, 250), Agriculture..Horticulture..and.the.Outdoors_scf = c(11,
17), Analysis_scf = c(50, 36), Architecture.and.Construction_scf = c(57,
51), Business_scf = c(872, 585), Customer.and.Client.Support_scf = c(302,
163), Design_scf = c(22, 17), Economics..Policy..and.Social.Studies_scf = c(7,
7), Education.and.Training_scf = c(77, 49), Energy.and.Utilities_scf = c(25,
28), Engineering_scf = c(90, 64), Environment_scf = c(19,
19), Finance_scf = c(455, 313), Health.Care_scf = c(105,
71), Human.Resources_scf = c(163, 124), Industry.Knowledge_scf = c(265,
174), Information.Technology_scf = c(467, 402), Legal_scf = c(21,
17), Maintenance..Repair..and.Installation_scf = c(194, 222
), Manufacturing.and.Production_scf = c(176, 174), Marketing.and.Public.Relations_scf = c(139,
109), Media.and.Writing_scf = c(18, 20), Personal.Care.and.Services_scf = c(31,
16), Public.Safety.and.National.Security_scf = c(14, 7),
Religion_scf = c(0, 0), Sales_scf = c(785, 463), Science.and.Research_scf = c(52,
24), Supply.Chain.and.Logistics_scf = c(838, 455), total_scf = c(5599,
3877)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), groups = structure(list(ym = c("2007-01-01",
"2007-02-01"), NAICS2 = c(0L, 0L), occ_type = c("is_middle_manager",
"is_middle_manager"), .rows = list(1L, 2L)), row.names = c(NA,
-2L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE))