how to plot number of valid rows with ggplot2 - r

With a dataframe as
# Example data: two identifier columns (name, class) and three value
# columns (var1, var2, var3), some of which contain NA.
df <- data.frame(
  name  = c("a", "b", "c", "d", "e"),
  class = c("a1", "a1", "a1", "b1", "b1"),
  var1  = c("S", "S", "R", "S", "S"),
  var2  = c("S", "R", NA, NA, "R"),
  var3  = c(NA, "R", "R", "S", "S")
)
I would like to plot the number of rows without NAs for each of var1 through var3.
One way I found is to generate another dataframe as
# Summary table with one row per value column (columns 3 to 5 of df):
# the column name as a factor and the number of non-NA entries in it.
df_count <- data.frame(
  var_num = as.factor(names(df)[3:5]),
  count = unname(
    vapply(df[3:5], function(column) sum(!is.na(column)), integer(1))
  )
)
and then plot as
ggplot(df_count, aes(x=var_num, y=count)) + geom_bar(stat="identity")
Is there an easier way to choose var1 through var3 and count the valid rows without generating a new dataframe?

library(ggplot2)
library(reshape2)
# Reshape to long format: one row per (name, class, variable) with its value.
# Results go into new objects so the original `df` is not overwritten and
# can be reused.
df_long <- melt(df, id.vars = c("name", "class"))
# Keep only the rows whose value is present.
df_long <- df_long[!is.na(df_long$value), ]
# Count the remaining rows per variable. aggregate() applies length() to
# every column, so `value` in the result holds the row count and `Group.1`
# holds the variable name.
df_counts <- aggregate(df_long, by = list(df_long$variable), FUN = length)
ggplot(df_counts, aes(x = Group.1, y = value, fill = Group.1)) +
  geom_bar(stat = "identity")
stacked bar
# Reshape to long format: one row per (name, class, variable) with its value.
# Results go into new objects so the original `df` is not overwritten.
df_long <- melt(df, id.vars = c("name", "class"))
# Keep only the rows whose value is present.
df_long <- df_long[!is.na(df_long$value), ]
# Count rows per (variable, value) pair. aggregate() applies length() to
# every column, so `value` in the result holds the row count, `Group.1` the
# variable name and `Group.2` the original value ("S" or "R").
df_counts <- aggregate(
  df_long,
  by = list(df_long$variable, df_long$value),
  FUN = length
)
ggplot(df_counts, aes(x = Group.1, y = value, fill = Group.2)) +
  geom_bar(stat = "identity")
Data:
# Input data used by the snippets above: identifier columns (name, class)
# plus three value columns with some NA entries.
df <- data.frame(
  name  = c("a", "b", "c", "d", "e"),
  class = c("a1", "a1", "a1", "b1", "b1"),
  var1  = c("S", "S", "R", "S", "S"),
  var2  = c("S", "R", NA, NA, "R"),
  var3  = c(NA, "R", "R", "S", "S")
)

Related

Counting the occurrence of a word but only once per row (R)

I want to count the number of times a word appears but only once per row. How do I complete the code?
library(stringr)
# Three character vectors of equal length; element i of each forms row i.
var1 <- c("x", "x", "x", "x", "x", "x", "y", "y", "y", "y")
var2 <- c("x", "x", "b", "b", "c", "d", "e", "y", "g", "h")
var3 <- c("x", "x", "b", "b", "c", "d", "e", "y", "g", "h")
# Bind the vectors column-wise into a matrix, then convert to a data frame.
data <- data.frame(cbind(var1, var2, var3))
# Attempted count of "x": str_count() is handed the whole data frame here.
sum(str_count(data, "x"))
The result should be 6.
The following should do the trick:
sum(rowSums(data == "x") >= 1) # Thanks Maël
# [1] 6
This checks whether each row contains at least one "x" (via rowSums()) and then counts the rows that do using sum().
Or alternatively (per Antreas's comment so it is not missed):
length(which(rowSums(data == "x") != 0)) # Thanks to Antreas Stefopoulos
Which counts the number of non-zero rows with length()

Plotting based on occurrence in group

I would like to make a bar chart that plots each bar as a proportion of the total number of groups rather than the usual percentage. For a var to "count" it only needs to occur once in a group. For example, in this df, where id is the grouping variable:
df <-
tibble(id = c(rep(1, 3), rep(2, 3), rep(3, 3)),
vars = c("a", NA, "b", "c", "d", "e", "a", "a", "a"))
The bars would be:
a = 2/3 # since a occurs in 2 out of 3 groups
b = 1/3
c = 1/3
d = 1/3
e = 1/3
If I understand you correctly, a one-liner would suffice:
# Drop duplicated (id, vars) rows so a value counts once per group, then
# divide each bar's count by the number of distinct groups.
# after_stat() replaces the deprecated stat() for referring to computed
# variables such as `count`.
ggplot(distinct(df)) +
  geom_bar(aes(vars, after_stat(count) / n_distinct(df$id)))
Working answer:
tibble(
  id = c(rep(1, 3), rep(2, 3), rep(3, 3)),
  vars = c("a", "a", "b", "c", "d", "e", "a", "a", "a")
) %>%
  # Keep each value at most once per group, so repeats within an id do not
  # inflate the count.
  distinct(id, vars) %>%
  # n = number of groups in which each value occurs.
  add_count(vars) %>%
  # Share of groups containing the value.
  mutate(prop = n / n_distinct(id)) %>%
  # One row per value for plotting.
  distinct(vars, .keep_all = TRUE) %>%
  ggplot(aes(vars, prop)) +
  geom_col()

calculate duration in a complex table

I have a table as shown.
df <- data.frame("name" = c("jack", "william", "david", "john"),
"01-Jan-19" = c(NA,"A",NA,"A"),
"01-Feb-19" = c("A","A",NA,"A"),
"01-Mar-19" = c("A","A","A","A"),
"01-Apr-19" = c("A","A","A","A"),
"01-May-19" = c(NA,"A","A","A"),
"01-Jun-19" = c("A","SA","A","SA"),
"01-Jul-19" = c("A","SA","A","SA"),
"01-Aug-19" = c(NA,"SA","A","SA"),
"01-Sep-19" = c(NA,"SA","A","SA"),
"01-Oct-19" = c("SA","SA","A","SA"),
"01-Nov-19" = c("SA","SA",NA,"SA"),
"01-Dec-19" = c("SA","SA","SA",NA),
"01-Jan-20" = c("SA","M","A","M"),
"01-Feb-20" = c("M","M","M","M"))
Over a time period, each person progresses through a sequence of positions (3 position categories, from A to SA to M). My objective is:
Calculate the average duration of A (assistant) position and SA (senior assistant) position. i.e. the duration between the date the first of one category appears, and the date the last of this category appears, regardless of missing data in between.
I transposed the data using R “gather” function
df1 <- gather (df, "date", "position", 2:15)
then I am not sure how to best proceed. What might be the best way to further approach this?
We can get the data in longer format and calculate the number of days between first date when the person was "SA" and the first date when he was "A".
library(dplyr)
library(lubridate) # provides dmy() for parsing the "01-Jan-19" column names
df %>%
  # Long format: one row per (name, date); missing positions are dropped.
  tidyr::pivot_longer(cols = -name, names_to = "date", values_drop_na = TRUE) %>%
  mutate(date = dmy(date)) %>%
  group_by(name) %>%
  # match() returns the position of the first occurrence, so this is the
  # time between the first "A" date and the first "SA" date per person.
  # The column is called `duration` to agree with the printed output and
  # with the follow-up `pull(duration)` step.
  summarise(duration = date[match("SA", value)] - date[match("A", value)])
# name duration
# <fct> <drtn>
#1 david 275 days
#2 jack 242 days
#3 john 151 days
#4 william 151 days
If the mean value is needed, we can pull the duration column and compute its mean by appending the following to the chain above:
%>% pull(duration) %>% mean
#Time difference of 204.75 days
data
df <- structure(list(name = c("jack", "william", "david", "john"),
`01-Jan-19` = c(NA, "A", NA, "A"), `01-Feb-19` = c("A", "A",
NA, "A"), `01-Mar-19` = c("A", "A", "A", "A"), `01-Apr-19` = c("A",
"A", "A", "A"), `01-May-19` = c(NA, "A", "A", "A"), `01-Jun-19` = c("A",
"SA", "A", "SA"), `01-Jul-19` = c("A", "SA", "A", "SA"),
`01-Aug-19` = c(NA, "SA", "A", "SA"), `01-Sep-19` = c(NA,
"SA", "A", "SA"), `01-Oct-19` = c("SA", "SA", "A", "SA"),
`01-Nov-19` = c("SA", "SA", NA, "SA"), `01-Dec-19` = c("SA",
"SA", "SA", NA), `01-Jan-20` = c("SA", "M", "A", "M"), `01-Feb-20` = c("M",
"M", "M", "M")), row.names = c(NA, -4L), class = "data.frame")

dplyr filter with exception

So I'm trying to filter out certain things in my dataset.
Here's a really pared-down example of my dataset:
fish <- data.frame ("order"=c("a", "a", "a", "b", "b", "c", "c", "d", "d", "e", "e"),
"family"= c("r", "s", "t", "r", "y", "y", "y", "u", "y", "u", "y"),
"species"=c(7, 8, 9, 6, 5, 4, 3, 10, 1, 11, 2))
so I have
fish <- fish%>%
filter(
!(order %in% c("a", "b", "c"))&
!(family %in% c("r","s","t","u"))
)
which should remove all orders in a, b, c and all families in r, s, t, u, leaving me with
order family species
d y 10
e y 11
But the issue is, there are two species that are in families that I am filtering out. So say species 1 is in family "r". I want species 1 to stay in the dataset, while filtering all the rest of family r. So I want the output to look like:
order family species
d y 10
e y 11
d r 1
e r 2
How can I make sure that when I'm filtering out the groups of family, it keeps these two species?
Thanks!
You could rbind the results of three separate filters:
# Rows outside the excluded orders and families. %in% tests set membership
# for every row; `!=` against a multi-element vector would instead compare
# element by element.
temp1 <- filter(
  fish,
  !(order %in% c("a", "b", "c")) & !(family %in% c("r", "s", "t", "u"))
)
# The individual (family, species) exceptions that must be kept.
temp2 <- filter(fish, family == "r" & species == 1)
temp3 <- filter(fish, family == "s" & species == 2)
# Stack the three results and drop the temporaries.
fish <- rbind(temp1, temp2, temp3)
rm(temp1, temp2, temp3)
It would be most natural to have the filtering process mirror your logic --
Filter #1: filter-out undesirable order and family
Filter #2: filter desirable family, species pairs
Note: I had to change your family, species pair criteria to get matches.
library(dplyr)
library(purrr)
# your example data
fish <- tibble(
  order = c("a", "a", "a", "b", "b", "c", "c", "d", "d", "e", "e"),
  family = c("r", "s", "t", "r", "y", "y", "y", "u", "y", "u", "y"),
  species = c(7, 8, 9, 6, 5, 4, 3, 10, 1, 11, 2)
)
# put filter criteria in variables
order_filter <- c("a", "b", "c")
family_filter <- c("r", "s", "t", "u")
# Filter 1: drop every row in an unwanted order or an unwanted family
df1 <- fish %>%
  filter(
    !(order %in% order_filter),
    !(family %in% family_filter)
  )
# Filter 2: pull back the specific (family, species) pairs to keep.
# Each pair is a list rather than c("r", 7), so the species stays numeric
# instead of being coerced to a string before the comparison.
keep_pairs <- list(
  list(family = "r", species = 7),
  list(family = "s", species = 8)
)
df2 <- map_df(
  .x = keep_pairs,
  .f = function(pair) {
    fish %>%
      filter(family == pair$family, species == pair$species)
  }
)
# Combine two data frames created by Filter 1 and Filter 2
df_final <- bind_rows(df1, df2)
print(df_final)
# A tibble: 4 x 3
# order family species
# <chr> <chr> <dbl>
# 1 d y 1
# 2 e y 2
# 3 a r 7
# 4 a s 8

Cross table graph similar to excel

I need to put a 3D kind of graph similar to attached image created in excel. I am not sure whether we can do this in ggplot?
structure(list(Name = c("A", "B", "C", "D"), P = c(15089, NA,
NA, 43083), Q = c(1589, NA, NA, 18120), R = c(93751, NA, 4709,
211649), S = c(34167, 1323, 1520, 82378), T = c(8831, NA, 4544,
15157)), .Names = c("Name", "P", "Q", "R", "S", "T"), row.names = c(NA,
4L), class = "data.frame")
I have worked with this following code.
ggplot(a, aes(x = a$A, y = a$Amount, fill = a$B)) +
geom_col(position = 'stack') +
geom_text(aes(label = a$Amount), position = position_stack(vjust = .5),
color='grey25', size=2) + coord_flip()
The problem is that the labels shown on top of the graph are overlapping.
Updated:
Actually, I thought I needed to reshape the data to achieve this kind of graph, though I am not so sure. So I reshaped the data as below:
structure(list(AA = c("A", "A", "A", "A", "A", "B", "B", "B",
"B", "B", "C", "C", "C", "C", "C", "D", "D", "D", "D", "D"),
BB = c("P", "Q", "R", "S", "T", "P", "Q", "R", "S", "T",
"P", "Q", "R", "S", "T", "P", "Q", "R", "S", "T"), Amount = c(15089,
1589, 93751, 34167, 8831, NA, NA, NA, 1323, NA, NA, NA, 4709,
1520, 4544, 43083, 18120, 211649, 82378, 15157)), .Names = c("AA",
"BB", "Amount"), row.names = c(NA, 20L), class = "data.frame")
I tried the following code to achieve this to which the labels are overlapping
ggplot(a, aes(x = AA, y = Amount, fill = BB)) +
geom_col(position = 'stack')+
geom_text(aes(label = Amount),
position = position_stack(vjust = 0.2),
color='grey25',
size=2) +
coord_flip()
Also, when I supply this to ggplotly for shiny, the graph does not appear in the dashboard.

Resources