filter DataFrame to exclude specific dates - r

I want to know how to filter a DataFrame to exclude specific and discrete dates.
# Input Data
dates = c("2021-03-31", "2021-05-02", "2021-06-30", "2021-10-22")
dates = as.Date(dates)
x = structure(list(Gender = c("Male", "Female", "Male", "Male", "Female",
"Male", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Male", "Male", "Female",
"Female", "Female", "Male", "Female", "Female", "Male", "Female",
"Male", "Female", "Female", "Female", "Male", "Male", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Male",
"Female", "Male", "Female", "Male", "Female", "Female", "Female",
"Male", "Male", "Female", "Female", "Female", "Male", "Male",
"Female", "Female", "Female", "Male", "Female", "Male", "Female",
"Male", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Male", "Female", "Female", "Female", "Female",
"Female", "Male", "Male", "Female", "Male", "Female", "Female",
"Male", "Female", "Female", "Female", "Female", "Female", "Male",
"Female", "Female", "Male", "Female", "Female", "Female", "Female",
"Female"), `Termination Date` = c("2021-01-05", "2021-02-12",
"2021-02-22", "2021-02-24", "2021-03-12", "2021-03-12", "2021-03-24",
"2021-03-26", "2021-03-31", "2021-03-31", "2021-03-31", "2021-03-31",
"2021-03-31", "2021-03-31", "2021-03-31", "2021-03-31", "2021-03-31",
"2021-04-02", "2021-04-02", "2021-04-05", "2021-04-09", "2021-04-30",
"2021-05-05", "2021-05-11", "2021-05-11", "2021-05-14", "2021-05-21",
"2021-05-21", "2021-05-24", "2021-06-01", "2021-06-11", "2021-06-11",
"2021-06-14", "2021-06-24", "2021-06-27", "2021-06-27", "2021-06-27",
"2021-06-27", "2021-07-02", "2021-07-07", "2021-07-23", "2021-07-26",
"2021-07-26", "2021-07-27", "2021-07-30", "2021-08-02", "2021-08-06",
"2021-08-06", "2021-08-09", "2021-08-11", "2021-08-13", "2021-08-13",
"2021-08-13", "2021-08-13", "2021-08-16", "2021-08-18", "2021-08-20",
"2021-08-23", "2021-08-24", "2021-08-25", "2021-08-27", "2021-08-27",
"2021-08-30", "2021-08-30", "2021-08-31", "2021-09-01", "2021-09-03",
"2021-09-03", "2021-09-15", "2021-09-16", "2021-09-20", "2021-09-22",
"2021-09-23", "2021-09-23", "2021-09-24", "2021-09-24", "2021-10-01",
"2021-10-04", "2021-10-06", "2021-10-08", "2021-10-08", "2021-10-08",
"2021-10-11", "2021-10-14", "2021-10-19", "2021-10-20", "2021-10-21",
"2021-10-22", "2021-10-22", "2021-10-29", "2021-11-02", "2021-11-03",
"2021-11-08", "2021-11-09", "2021-11-16", "2021-11-16", "2021-11-17"
)), row.names = c(229L, 8247L, 3068L, 7222L, 3746L, 3912L, 8019L,
3610L, 6078L, 6085L, 6271L, 6284L, 6285L, 6310L, 6321L, 6335L,
6336L, 3697L, 9149L, 8217L, 3734L, 220L, 6729L, 5562L, 7729L,
7933L, 5291L, 7232L, 1647L, 7335L, 3418L, 7189L, 2912L, 7790L,
6088L, 6247L, 6281L, 6338L, 7608L, 6614L, 410L, 2746L, 8296L,
3117L, 177L, 2788L, 3301L, 6221L, 5173L, 2092L, 3577L, 6219L,
6973L, 9020L, 1274L, 1768L, 8218L, 1822L, 2499L, 8107L, 1910L,
4756L, 2739L, 7342L, 7857L, 6519L, 2104L, 3666L, 7506L, 2635L,
3402L, 5566L, 2637L, 3036L, 2976L, 3871L, 8376L, 3112L, 4772L,
6449L, 8200L, 8445L, 3310L, 4005L, 3219L, 8241L, 8266L, 2995L,
3273L, 8401L, 3336L, 3118L, 2272L, 3333L, 3370L, 3952L, 7339L
), class = "data.frame")
Normally, I would do the following but I assume it doesn't work in this case since I am using a date class. How would I do this using dates?
# Filter df to exclude rows that were entered on a date from the list
x[!(x$`Termination Date` %in% dates), ]

When I run your example data, I see the Termination Date column is interpreted as the character class, not the date class.
Here is a solution that uses the tidyverse:
# Input Data
dates = c("2021-03-31", "2021-05-02", "2021-06-30", "2021-10-22")
x = structure(list(Gender = c("Male", "Female", "Male", "Male", "Female",
"Male", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Male", "Male", "Female",
"Female", "Female", "Male", "Female", "Female", "Male", "Female",
"Male", "Female", "Female", "Female", "Male", "Male", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Male",
"Female", "Male", "Female", "Male", "Female", "Female", "Female",
"Male", "Male", "Female", "Female", "Female", "Male", "Male",
"Female", "Female", "Female", "Male", "Female", "Male", "Female",
"Male", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Male", "Female", "Female", "Female", "Female",
"Female", "Male", "Male", "Female", "Male", "Female", "Female",
"Male", "Female", "Female", "Female", "Female", "Female", "Male",
"Female", "Female", "Male", "Female", "Female", "Female", "Female",
"Female"), `Termination Date` = c("2021-01-05", "2021-02-12",
"2021-02-22", "2021-02-24", "2021-03-12", "2021-03-12", "2021-03-24",
"2021-03-26", "2021-03-31", "2021-03-31", "2021-03-31", "2021-03-31",
"2021-03-31", "2021-03-31", "2021-03-31", "2021-03-31", "2021-03-31",
"2021-04-02", "2021-04-02", "2021-04-05", "2021-04-09", "2021-04-30",
"2021-05-05", "2021-05-11", "2021-05-11", "2021-05-14", "2021-05-21",
"2021-05-21", "2021-05-24", "2021-06-01", "2021-06-11", "2021-06-11",
"2021-06-14", "2021-06-24", "2021-06-27", "2021-06-27", "2021-06-27",
"2021-06-27", "2021-07-02", "2021-07-07", "2021-07-23", "2021-07-26",
"2021-07-26", "2021-07-27", "2021-07-30", "2021-08-02", "2021-08-06",
"2021-08-06", "2021-08-09", "2021-08-11", "2021-08-13", "2021-08-13",
"2021-08-13", "2021-08-13", "2021-08-16", "2021-08-18", "2021-08-20",
"2021-08-23", "2021-08-24", "2021-08-25", "2021-08-27", "2021-08-27",
"2021-08-30", "2021-08-30", "2021-08-31", "2021-09-01", "2021-09-03",
"2021-09-03", "2021-09-15", "2021-09-16", "2021-09-20", "2021-09-22",
"2021-09-23", "2021-09-23", "2021-09-24", "2021-09-24", "2021-10-01",
"2021-10-04", "2021-10-06", "2021-10-08", "2021-10-08", "2021-10-08",
"2021-10-11", "2021-10-14", "2021-10-19", "2021-10-20", "2021-10-21",
"2021-10-22", "2021-10-22", "2021-10-29", "2021-11-02", "2021-11-03",
"2021-11-08", "2021-11-09", "2021-11-16", "2021-11-16", "2021-11-17"
)), row.names = c(229L, 8247L, 3068L, 7222L, 3746L, 3912L, 8019L,
3610L, 6078L, 6085L, 6271L, 6284L, 6285L, 6310L, 6321L, 6335L,
6336L, 3697L, 9149L, 8217L, 3734L, 220L, 6729L, 5562L, 7729L,
7933L, 5291L, 7232L, 1647L, 7335L, 3418L, 7189L, 2912L, 7790L,
6088L, 6247L, 6281L, 6338L, 7608L, 6614L, 410L, 2746L, 8296L,
3117L, 177L, 2788L, 3301L, 6221L, 5173L, 2092L, 3577L, 6219L,
6973L, 9020L, 1274L, 1768L, 8218L, 1822L, 2499L, 8107L, 1910L,
4756L, 2739L, 7342L, 7857L, 6519L, 2104L, 3666L, 7506L, 2635L,
3402L, 5566L, 2637L, 3036L, 2976L, 3871L, 8376L, 3112L, 4772L,
6449L, 8200L, 8445L, 3310L, 4005L, 3219L, 8241L, 8266L, 2995L,
3273L, 8401L, 3336L, 3118L, 2272L, 3333L, 3370L, 3952L, 7339L
), class = "data.frame")
library(dplyr)
# Keep every row whose `Termination Date` is not listed in `dates`.
# The column and `dates` are compared exactly as stored, so both must use
# the same representation (here: "YYYY-MM-DD" strings).
df_without_undesired_dates <- filter(
  x,
  !(`Termination Date` %in% dates)
)

Related

Combined Boxplots in R

I want to draw the same exact graph in R. However, I want to consider two options:
(1) with one x axis for each of the genders &
(2) two different xaxes for each of the gender. Here is also the link for where I found the image: https://rpubs.com/WhataBurger/Anovatype3
Thanks for sharing the knowledge.
Here is a randomly generated one. Please feel free to share your random data in the responses (if you have any).
Show in New Window
structure(list(gender = c("Male", "Male", "Male", "Male", "Male",
"Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male",
"Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male",
"Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male",
"Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male",
"Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male",
"Male", "Male", "Male", "Male", "Male", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female"), education = c("Education",
"Education", "Education", "Education", "Education", "Education",
"Education", "Education", "Education", "Education", "Education",
"Education", "Education", "Education", "Education", "Education",
"Education", "Education", "Education", "Education", "Education",
"Education", "Education", "Education", "Education", "Education",
"Education", "Education", "Education", "Education", "Education",
"Education", "Education", "Education", "Education", "Education",
"Education", "Education", "Education", "Education", "Education",
"Education", "Education", "Education", "Education", "Education",
"Education", "Education", "Education", "Education", "No Education",
"No Education", "No Education", "No Education", "No Education",
"No Education", "No Education", "No Education", "No Education",
"No Education", "No Education", "No Education", "No Education",
"No Education", "No Education", "No Education", "No Education",
"No Education", "No Education", "No Education", "No Education",
"No Education", "No Education", "No Education", "No Education",
"No Education", "No Education", "No Education", "No Education",
"No Education", "No Education", "No Education", "No Education",
"No Education", "No Education", "No Education", "No Education",
"No Education", "No Education", "No Education", "No Education",
"No Education", "No Education", "No Education", "No Education",
"No Education", "No Education", "No Education", "No Education",
"No Education"), salary = c(54395.2435344779, 57698.2251051672,
75587.0831414912, 60705.0839142458, 61292.8773516095, 77150.6498688328,
64609.162059892, 47349.3876539347, 53131.4714810647, 55543.3802990004,
72240.8179743946, 63598.1382705736, 64007.7145059405, 61106.8271594512,
54441.5886524592, 77869.1313680308, 64978.5047822924, 40333.8284337036,
67013.5590156369, 55272.0859227207, 49321.7629401315, 57820.250853417,
49739.9555169276, 52711.0877070886, 53749.6073215074, 54395.2435344779,
57698.2251051672, 75587.0831414912, 60705.0839142458, 61292.8773516095,
77150.6498688328, 64609.162059892, 47349.3876539347, 53131.4714810647,
55543.3802990004, 72240.8179743946, 63598.1382705736, 64007.7145059405,
61106.8271594512, 54441.5886524592, 77869.1313680308, 64978.5047822924,
40333.8284337036, 67013.5590156369, 55272.0859227207, 49321.7629401315,
57820.250853417, 49739.9555169276, 52711.0877070886, 53749.6073215074,
23253.2267570303, 33351.1481779781, 30613.4924713461, 25447.4522519522,
35015.2596842797, 31705.8568859073, 28819.7140680309, 33580.5026441801,
33512.5339501322, 33286.3243265499, 32754.5610164004, 32215.6706141504,
29752.3531576931, 28776.1493450403, 28478.1159959505, 27221.172084318,
29168.3308879216, 24938.4145937269, 38675.8238613541, 34831.84799322,
25507.5656671866, 28388.4606588037, 28133.3785855071, 33119.8604733453,
29666.5237341127, 23253.2267570303, 33351.1481779781, 30613.4924713461,
25447.4522519522, 35015.2596842797, 31705.8568859073, 28819.7140680309,
33580.5026441801, 33512.5339501322, 33286.3243265499, 32754.5610164004,
32215.6706141504, 29752.3531576931, 28776.1493450403, 28478.1159959505,
27221.172084318, 29168.3308879216, 24938.4145937269, 38675.8238613541,
34831.84799322, 25507.5656671866, 28388.4606588037, 28133.3785855071,
33119.8604733453, 29666.5237341127)), class = "data.frame", row.names = c(NA,
-100L))
Take a look at this code; it may help you get started. Your data is not complete: every "Education" row is male and every "No Education" row is female, so facet_wrap() cannot show all category combinations. Still, I think this may be of help.
Once your variables are loaded, build a data frame and plot it with ggplot:
library (ggplot2)
# Assemble the three variables into one data frame for plotting.
# NOTE(review): assumes `education`, `gender` and `salary` already exist as
# equal-length vectors in the workspace - confirm before running.
df <- data.frame(education, gender, salary)

# plot 1: one panel per gender
ggplot(df, aes(x = education, y = salary, fill = gender)) +
  geom_boxplot() +
  facet_wrap(. ~ gender) +
  theme_bw()

# plot 2: both genders in a single panel
ggplot(df, aes(x = education, y = salary, fill = gender)) +
  geom_boxplot() +
  theme_bw()

Generate random pairs of factors in dataframe using R

I have a data frame with a character column of names, and I would like to randomly generate pairs of names from that column. My code gives all possible combinations. Instead, I want every name to be paired exactly once, in random order; a name cannot be partnered with itself.
My code is:
# Creating a dataframe
df <- data.frame(
  "Name" = c(
    "Amiya", "Raj", "Asish", "John", "ruban", "mary", "barath", "leema",
    "joshi", "indhu", "praveen", "joshua", "alex", "martin", "stella",
    "veronica", "henry", "rajesh", "yusuf", "jenita", "johana", "jerald",
    "jegan", "lincy", "jona", "rani", "julie", "ross", "chandler", "monica",
    "penny", "sheldon"
  ),
  "Sex" = c(
    "Female", "Male", "Male", "male", "male", "Female", "male", "Female",
    "Female", "Female", "male", "male", "male", "male", "Female", "Female",
    "male", "male", "male", "Female", "Female", "male", "male", "Female",
    "Female", "Female", "Female", "male", "male", "Female", "Female", "male"
  ),
  "Number" = c(
    8937998889, 2598279874, 4589987483, 2876876877, 2876876876, 2487698798,
    2879879877, 2887987897, 2878798733, 4309808098, 8748098990, 9883798798,
    8734787987, 8973498787, 8734887877, 9798374877, 8786487687, 7275687263,
    4379879847, 8943787876, 3874879874, 8978973987, 8978347878, 8839478768,
    9378887774, 8467676764, 7246276874, 7478798743, 6576787877, 7328776876,
    6648678833, 6378787878
  )
)
print(df)

# Extract the first column (the names) and show it.
cat("Accessing first and second column\n")
dat <- df[, 1]
print(dat)

# Every unordered pair of distinct names, one pair per row.
# The pair size (2) is an argument of combn(), not of unique(); with the
# parenthesis in the wrong place combn() is called without `m` and errors.
t(combn(unique(dat), 2))
TIA
Get the unique elements from the 'Name' column, shuffle them with sample(), and convert the result to a matrix with 2 columns (assuming the number of unique elements is even).
# Shuffle the distinct names, then lay them out in two columns so that each
# row of the matrix is one random pair.
shuffled_names <- sample(unique(df$Name))
matrix(shuffled_names, ncol = 2)

circlize::circos.heatmap Add gaps between cells

I am creating a circular heatmap as follows:
suppressPackageStartupMessages({
library(circlize)
})
# input data
dput(annot)
structure(list(Specimen_Type = c("Both", "Plasma", "Both", "Both",
"Plasma", "Plasma", "Plasma", "Both", "Both", "Both", "Plasma",
"Plasma", "Both", "Both", "Both", "Both", "Both", "Both", "Both",
"Both", "Both", "Plasma", "Both", "Both", "Plasma", "Both", "Plasma",
"Plasma", "Both", "Plasma", "Both", "CSF", "Both", "Plasma",
"Both", "Both", "Both", "Plasma", "Both", "Plasma", "Both", "Plasma",
"Plasma", "Both", "Both", "Plasma", "Both", "Both", "Plasma",
"Plasma", "Plasma", "Plasma", "Plasma", "Both", "Both"), Sex = c("Female",
"Female", "Female", "Male", "Female", "Female", "Female", "Female",
"Female", "Female", "Male", "Male", "Male", "Male", "Male", "Male",
"Female", "Female", "Male", "Male", "Female", "Female", "Male",
"Male", "Female", "Male", "Male", "Male", "Female", "Female",
"Male", "Male", "Female", "Female", "Male", "Male", "Female",
"Male", "Male", "Male", "Male", "Female", "Male", "Male", "Female",
"Female", "Male", "Male", "Female", "Male", "Male", "Male", "Female",
"Female", "Female")), row.names = c("15635-29", "15635-31", "15635-32",
"15635-37", "15635-38", "15635-182", "15635-42", "15635-43",
"15635-45", "15635-46", "15635-53", "15635-215", "15635-58",
"15635-60", "15635-63", "15635-68", "15635-70", "15635-75", "15635-80",
"15635-81", "15635-87", "15635-90", "15635-100", "15635-101",
"15635-108", "15635-120", "15635-127", "15635-129", "15635-132",
"15635-134", "15635-135", "15635-1", "15635-2", "15635-251",
"15635-7", "15635-11", "15635-145", "15635-148", "15635-150",
"15635-154", "15635-156", "15635-158", "15635-161", "15635-169",
"15635-170", "15635-187", "15635-197", "15635-214", "15635-228",
"15635-225", "15635-246", "15635-254", "15635-234", "15635-239",
"15635-279"), class = "data.frame")
# Grouping variable passed to circos.heatmap() as `split`.
split <- factor(annot$Specimen_Type)

# One colour for every value that occurs in the two annotation columns.
col_fun1 <- list(
  "Male" = "navy",
  "Female" = "deeppink4",
  "Plasma" = "#fcff5c",
  "CSF" = "#8d14ff",
  "Both" = "#14f9ff"
)

circos.par(start.degree = 30, gap.degree = 1, points.overflow.warning = FALSE)
circos.heatmap(
  annot,
  split = split,
  col = unlist(col_fun1),
  track.height = 0.4,
  bg.border = "gray50",
  bg.lty = 1.5,
  # Spelled out as TRUE: `T` is an ordinary variable that can be reassigned.
  show.sector.labels = TRUE
)
circos.clear()
How do I add gaps between individual cells in the heatmap?
I needed to update the circlize package and add cell.border = "white" param.

Removed N rows containing missing values BUT there are no missing values nor values out of range

I posted a similar question a week ago but I failed to identify the real problem. Therefore, the question was far from being correct.
Now I clearly know what is going on, but I cannot understand why it is happening. I also reviewed similar problems related to the same error, but the solutions to those problems were not applicable to my case.
I am plotting the frequency distribution of a variable over the course of a survey's fieldwork. Therefore, it shows how the proportion of that variable has changed over time.
So, I have a variable (Startday) that tells which day the respondent took the survey, if he/she did not then it is NA. Then, I have the typical variables like sex or marital status.
This is the code to plot such graph
# Cumulative share of female respondents, one row per survey start date.
female_share_by_day <- df %>%
  mutate(date = lubridate::mdy(startday)) %>%
  arrange(date) %>%
  mutate(
    Rs = cumsum(sf_sex %in% c("Male", "Female")),
    female_Rs = cumsum(sf_sex == "Female")
  ) %>%
  group_by(date) %>%
  slice(n()) %>% # keep the last row of each date
  select(date, Rs, female_Rs) %>%
  mutate(female_prop = female_Rs / Rs)

ggplot(female_share_by_day, aes(x = date, y = female_prop)) +
  geom_point() +
  geom_line()
And this is what I get.
Exactly what I want. The problem comes when I use marital status as the variable (it is of the same nature as the other one: a dummy stored as character). This is what I get using the following code:
# Cumulative share of married respondents, one row per survey start date.
married_share_by_day <- df %>%
  mutate(date = lubridate::mdy(startday)) %>%
  arrange(date) %>%
  mutate(
    Rs = cumsum(Maritaldummy %in% c("Not married", "Married")),
    Married_Rs = cumsum(Maritaldummy == "Married")
  ) %>%
  group_by(date) %>%
  slice(n()) %>% # keep the last row of each date
  select(date, Rs, Married_Rs) %>%
  mutate(Married_prop = Married_Rs / Rs)

ggplot(married_share_by_day, aes(x = date, y = Married_prop)) +
  geom_point() +
  geom_line()
Followed by these warnings:
Warning messages:
1: Removed 34 rows containing missing values (geom_point).
2: Removed 34 row(s) containing missing values (geom_path).
As you can see the observations stop around the 5th of June.
Things to consider:
It is not out of range as I tried change the graph's range using ylim() and xlim()
There are no missing values
The strange part is that this code works for experimental groups 2 and 3 (n = 350 each) but not for experimental group 1 (n = 2050). I believe the error must come from there, because when I randomly sample fewer than 1300 observations from group 1... it works!!! This is an example of the same code for group 2.
I am giving you a reproducible example, but I am afraid the error only appears with the full sample. Maybe you can still discover what is wrong with it?
Thanks a lot for the attention, time & help.
df <- structure(list(startday = c("06/02/2019", "05/22/2019", "05/28/2019",
"05/26/2019", "06/03/2019", "06/10/2019", "05/22/2019", "05/30/2019",
"05/31/2019", "06/18/2019", "05/22/2019", "05/25/2019", "05/25/2019",
"05/22/2019", "06/14/2019", "06/14/2019", "05/20/2019", "05/27/2019",
"05/20/2019", "05/21/2019", "05/20/2019", "05/20/2019", "06/09/2019",
"06/12/2019", "05/24/2019", "05/20/2019", "05/20/2019", "05/28/2019",
"06/09/2019", "05/20/2019", "06/21/2019", "06/03/2019", "06/07/2019",
"05/26/2019", "05/28/2019", "06/03/2019", "06/06/2019", "06/05/2019",
"05/27/2019", "06/10/2019", "05/20/2019", "06/05/2019", "05/20/2019",
"06/04/2019", "05/23/2019", "05/20/2019", "06/11/2019", "05/28/2019",
"06/09/2019", "06/15/2019", "05/25/2019", "06/14/2019", "05/20/2019",
"06/05/2019", "06/04/2019", "06/10/2019", "06/16/2019", "06/05/2019",
"06/29/2019", "05/30/2019", "06/03/2019", "06/09/2019", "05/20/2019",
"05/25/2019", "06/16/2019", "06/14/2019", "05/21/2019", "05/28/2019",
"06/09/2019", "06/07/2019", "05/25/2019", "05/20/2019", "05/27/2019",
"05/20/2019", "05/21/2019", "05/20/2019", "06/17/2019", "06/26/2019",
"06/07/2019", "05/22/2019", "06/19/2019", "06/04/2019", "05/21/2019",
"05/21/2019", "05/21/2019", "06/14/2019", "05/25/2019", "06/19/2019",
"05/20/2019", "06/03/2019", "05/20/2019", "06/04/2019", "05/20/2019",
"05/27/2019", "05/22/2019", "05/20/2019", "06/02/2019", "05/21/2019",
"05/23/2019", "06/03/2019", "06/14/2019", "06/14/2019", "06/07/2019",
"05/20/2019", "05/23/2019", "06/24/2019", "06/03/2019", "05/20/2019",
"06/06/2019", "06/15/2019", "06/06/2019", "05/27/2019", "05/24/2019",
"05/22/2019", "05/20/2019", "05/30/2019", "06/23/2019", "05/21/2019",
"05/20/2019", "06/16/2019", "05/20/2019", "05/24/2019", "05/21/2019",
"05/21/2019", "06/20/2019", "05/20/2019", "05/22/2019", "06/06/2019",
"05/20/2019", "05/21/2019", "06/15/2019", "05/27/2019", "05/26/2019",
"06/06/2019", "05/20/2019", "06/05/2019", "06/02/2019", "06/20/2019",
"05/22/2019", "05/20/2019", "06/03/2019", "05/20/2019", "06/03/2019",
"05/20/2019", "06/03/2019", "05/22/2019", "05/20/2019", "05/22/2019",
"05/22/2019", "05/20/2019", "05/20/2019", "05/23/2019", "05/23/2019",
"05/23/2019", "06/05/2019", "06/08/2019", "06/03/2019", "05/24/2019",
"06/05/2019", "06/02/2019", "05/20/2019", "05/29/2019", "06/04/2019",
"05/21/2019", "06/08/2019", "06/12/2019", "05/30/2019", "06/05/2019",
"06/12/2019", "05/20/2019", "05/20/2019", "06/26/2019", "05/20/2019",
"06/04/2019", "05/20/2019", "06/06/2019", "05/24/2019", "05/24/2019",
"06/06/2019", "06/22/2019", "05/26/2019", "05/29/2019", "05/27/2019",
"05/20/2019", "05/23/2019", "05/21/2019", "05/22/2019", "05/22/2019",
"06/11/2019", "06/05/2019", "06/05/2019", "05/28/2019", "05/23/2019",
"06/13/2019", "05/20/2019", "06/07/2019", "05/28/2019", "06/12/2019",
"06/28/2019", "06/15/2019"), sf_sex = c("Female", "Male", "Male",
"Male", "Male", "Female", "Female", "Female", "Female", "Female",
"Female", "Male", "Female", "Male", "Female", "Female", "Female",
"Male", "Female", "Female", "Male", "Male", "Female", "Male",
"Male", "Female", "Male", "Female", "Female", "Male", "Male",
"Male", "Female", "Female", "Male", "Male", "Female", "Male",
"Female", "Male", "Female", "Female", "Female", "Male", "Male",
"Female", "Male", "Male", "Male", "Female", "Male", "Female",
"Male", "Male", "Male", "Female", "Female", "Female", "Female",
"Male", "Female", "Male", "Male", "Female", "Female", "Male",
"Male", "Male", "Male", "Female", "Male", "Male", "Female", "Female",
"Male", "Male", "Male", "Male", "Female", "Female", "Male", "Male",
"Female", "Male", "Male", "Male", "Female", "Female", "Female",
"Female", "Male", "Female", "Female", "Female", "Male", "Female",
"Female", "Female", "Male", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Male", "Female", "Male", "Male",
"Female", "Male", "Female", "Female", "Male", "Female", "Male",
"Male", "Female", "Female", "Female", "Male", "Female", "Female",
"Male", "Female", "Male", "Female", "Female", "Male", "Female",
"Female", "Male", "Female", "Male", "Male", "Female", "Female",
"Female", "Female", "Female", "Male", "Female", "Female", "Female",
"Female", "Female", "Male", "Female", "Male", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Male", "Male",
"Male", "Female", "Female", "Female", "Female", "Female", "Male",
"Male", "Female", "Female", "Female", "Male", "Female", "Male",
"Female", "Male", "Female", "Female", "Male", "Female", "Male",
"Male", "Female", "Male", "Female", "Female", "Male", "Female",
"Female", "Male", "Female", "Female", "Female", "Male", "Male",
"Male", "Female", "Female", "Female", "Female", "Male"), Maritaldummy = c("Not married",
"Married", "Married", "Not married", "Not married", "Married",
"Married", "Married", "Not married", "Not married", "Not married",
"Married", "Married", "Married", "Married", "Married", "Not married",
"Not married", "Not married", "Married", "Not married", "Not married",
"Not married", "Not married", "Not married", "Married", "Married",
"Not married", "Married", "Not married", "Married", "Not married",
"Not married", "Not married", "Not married", "Not married", "Married",
"Not married", "Married", "Married", "Not married", "Not married",
"Married", "Not married", "Married", "Not married", "Not married",
"Not married", "Married", "Married", "Married", "Not married",
"Not married", "Married", "Married", "Not married", "Not married",
"Married", "Married", "Not married", "Married", "Married", "Married",
"Not married", "Married", "Not married", "Not married", "Married",
"Not married", "Married", "Not married", "Not married", "Not married",
"Married", "Not married", "Not married", "Married", "Married",
"Not married", "Married", "Married", "Married", "Married", "Married",
"Married", "Not married", "Married", "Not married", "Not married",
"Not married", "Not married", "Not married", "Married", "Not married",
"Married", "Married", "Not married", "Not married", "Married",
"Not married", "Married", "Married", "Married", "Married", "Not married",
"Married", "Married", "Married", "Not married", "Married", "Not married",
"Not married", "Married", "Not married", "Married", "Not married",
"Not married", "Married", "Not married", "Married", "Not married",
"Married", "Married", "Not married", "Married", "Married", "Married",
"Not married", "Married", "Married", "Married", "Married", "Married",
"Married", "Married", "Married", "Not married", "Not married",
"Not married", "Married", "Married", "Married", "Not married",
"Married", "Not married", "Married", "Not married", "Married",
"Married", "Married", "Married", "Married", "Not married", "Married",
"Not married", "Not married", "Married", "Married", "Married",
"Married", "Married", "Married", "Married", "Married", "Not married",
"Married", "Married", "Married", "Not married", "Not married",
"Married", "Not married", "Married", "Not married", "Married",
"Married", "Not married", "Not married", "Married", "Not married",
"Married", "Not married", "Not married", "Married", "Not married",
"Not married", "Married", "Married", "Married", "Not married",
"Not married", "Not married", "Married", "Married", "Married",
"Married", "Not married", "Not married", "Married", "Not married")), row.names = c("3564", "2999", "20144", "17281", "11917",
"14549", "5116", "10553", "23108", "19521", "277", "24312", "5449",
"19006", "9171", "21265", "20494", "11961", "15556", "12237",
"10959", "23460", "14050", "13996", "16222", "21852", "5593",
"18871", "18770", "776", "24913", "7813", "25079", "1063", "22878",
"13638", "19169", "7226", "14895", "8088", "19789", "22835",
"14196", "13816", "7124", "10394", "8290", "16807", "732", "3130",
"16033", "14958", "7500", "15039", "1538", "12532", "2890", "18907",
"21581", "3120", "20198", "22943", "8468", "3128", "24153", "22911",
"6225", "8489", "13040", "17506", "14855", "1500", "11955", "24484",
"17625", "19888", "10351", "19210", "22946", "14699", "1959",
"6770", "23286", "11842", "12811", "22197", "5899", "10138",
"20505", "16090", "17835", "20512", "12271", "9152", "12767",
"25244", "16865", "6970", "10036", "22531", "12329", "15366",
"2", "9440", "2100", "23166", "11421", "18912", "4441", "25202",
"20599", "411", "12584", "1586", "4543", "1307", "10044", "25033",
"5005", "25122", "16236", "9653", "16194", "14393", "7512", "10059",
"12010", "1619", "3136", "24088", "14641", "19564", "9568", "18815",
"21079", "22010", "9553", "20380", "20416", "15745", "7000",
"7735", "24924", "15286", "20403", "4680", "13714", "13302",
"12508", "17514", "4480", "7446", "3723", "24069", "25317", "14607",
"12274", "21715", "8983", "23488", "9228", "7265", "18192", "16475",
"11760", "15530", "18177", "11535", "18839", "17908", "9789",
"18045", "1025", "21645", "11853", "22453", "18052", "22763",
"9", "12286", "15329", "3306", "13215", "16533", "18385", "23784",
"10131", "4894", "14154", "3365", "8648", "17325", "21219", "16689",
"9969", "10621", "24206", "19621", "8440", "19889"), class = "data.frame")
We can reproduce the error if you change any one value to NA in the column.
library(dplyr)
library(ggplot2)
# Inject a single missing value so the warnings can be reproduced.
df[["Maritaldummy"]][195] <- NA

# Same pipeline as in the question: `==` is still used for Married_Rs.
married_share_with_na <- df %>%
  mutate(date = lubridate::mdy(startday)) %>%
  arrange(date) %>%
  mutate(
    Rs = cumsum(Maritaldummy %in% c("Not married", "Married")),
    Married_Rs = cumsum(Maritaldummy == "Married")
  ) %>%
  group_by(date) %>%
  slice(n()) %>% # keep the last row of each date
  select(date, Rs, Married_Rs) %>%
  mutate(Married_prop = Married_Rs / Rs)

ggplot(married_share_with_na, aes(x = date, y = Married_prop)) +
  geom_point() +
  geom_line()
Returns
Warning messages:
1: Removed 38 rows containing missing values (geom_point).
2: Removed 38 row(s) containing missing values (geom_path).
Since at least one value is NA, Maritaldummy == "Married" yields NA for that row, and cumsum() then returns NA for that value and for every value after it. An easy fix is to use %in% instead of ==, because %in% returns FALSE (rather than NA) when the value being tested is NA.
# Cumulative share of married respondents per start date. Both counters use
# `%in%`, which yields FALSE for NA, so the running sums never become NA.
married_share_fixed <- df %>%
  mutate(date = lubridate::mdy(startday)) %>%
  arrange(date) %>%
  mutate(
    Rs = cumsum(Maritaldummy %in% c("Not married", "Married")),
    Married_Rs = cumsum(Maritaldummy %in% "Married")
  ) %>%
  group_by(date) %>%
  slice(n()) %>% # keep the last row of each date
  select(date, Rs, Married_Rs) %>%
  mutate(Married_prop = Married_Rs / Rs)

ggplot(married_share_fixed, aes(x = date, y = Married_prop)) +
  geom_point() +
  geom_line()

X- axis labels are not properly aligned in R barplot

I have a data.table that I group with the code below in order to create a stacked chart:
# Keep only the three columns needed for the chart.
causesDf <- causesDf[, c('Type', 'Gender', 'Total')]
# Sum within each Type/Gender combination. The grouping vectors are passed as
# an unnamed list of `causesDf$...` expressions, and the printed result below
# shows the grouping columns come out named `causesDf` and `causesDf.1`.
causesSort <- causesDf[, lapply(.SD, sum),
by=list(causesDf$Type, causesDf$Gender)]
and Data will be like below:
causesDf causesDf.1 Total
1: Illness (Aids/STD) Female 2892
2: Change in Economic Status Female 4235
3: Cancellation/Non-Settlement of Marriage Female 6126
4: Family Problems Female 133181
5: Illness (Aids/STD) Male 5831
6: Change in Economic Status Male 31175
7: Cancellation/Non-Settlement of Marriage Male 5170
and so on..
I am trying to make barplot like below:
# Bars: the totals in decreasing order; the two colours are recycled over bars.
bar_colours <- c("red", "green")
sorted_totals <- sort(causesSort$Total, decreasing = TRUE)
barpos <- barplot(
  sorted_totals,
  col = bar_colours,
  xlab = "", ylab = "",
  horiz = FALSE, las = 2
)
legend("topright", c("Male", "Female"), fill = bar_colours)

# Rotated labels placed at unit spacing, 30 user units below the plot region.
end_point <- 0.2 + nrow(causesSort) + nrow(causesSort) - 0.1
label_x <- seq(0.1, end_point, by = 1)
label_y <- par("usr")[3] - 30
text(
  label_x, label_y,
  labels = paste(causesSort$causesDf),
  srt = 60, adj = 1, xpd = TRUE, cex = 0.65
)
but X-labels are not aligning properly, did I miss anything?
Expected output like:
Edited:
causesSort
structure(list(causesDf = c("Illness (Aids/STD)", "Change in Economic Status",
"Cancellation/Non-Settlement of Marriage", "Physical Abuse (Rape/Incest Etc.)",
"Dowry Dispute", "Family Problems", "Ideological Causes/Hero Worshipping",
"Other Prolonged Illness", "Property Dispute", "Fall in Social Reputation",
"Illegitimate Pregnancy", "Failure in Examination", "Insanity/Mental Illness",
"Love Affairs", "Professional/Career Problem", "Divorce", "Drug Abuse/Addiction",
"Not having Children(Barrenness/Impotency", "Causes Not known",
"Unemployment", "Poverty", "Death of Dear Person", "Cancer",
"Suspected/Illicit Relation", "Paralysis", "Property Dispute",
"Unemployment", "Poverty", "Family Problems", "Illness (Aids/STD)",
"Drug Abuse/Addiction", "Other Prolonged Illness", "Death of Dear Person",
"Causes Not known", "Cancer", "Not having Children(Barrenness/Impotency",
"Cancellation/Non-Settlement of Marriage", "Paralysis", "Physical Abuse (Rape/Incest Etc.)",
"Professional/Career Problem", "Love Affairs", "Fall in Social Reputation",
"Dowry Dispute", "Ideological Causes/Hero Worshipping", "Illegitimate Pregnancy",
"Failure in Examination", "Change in Economic Status", "Insanity/Mental Illness",
"Divorce", "Suspected/Illicit Relation", "Not having Children (Barrenness/Impotency",
"Not having Children (Barrenness/Impotency"), causesDf.1 = c("Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Male", "Male", "Male", "Male",
"Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male",
"Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male",
"Male", "Male", "Male", "Male", "Male", "Female", "Male"), Total = c(2892,
4235, 6126, 2662, 31206, 133181, 776, 69072, 4601, 4697, 2391,
12054, 33352, 21339, 1596, 2535, 1205, 5523, 148134, 3748, 7905,
4707, 2878, 8093, 2284, 14051, 23617, 24779, 208771, 5831, 28841,
125493, 5614, 304985, 6180, 2299, 5170, 5002, 1330, 10958, 23700,
8767, 764, 1342, 103, 14951, 31175, 60877, 1598, 6818, 544, 222
)), row.names = c(NA, -52L), class = c("data.table", "data.frame"
)
# , .internal.selfref = <pointer: 0x00000000098d1ef0> # seems not to work
)
If you don't rely on 45° rotation (that one is a bit more tricky) you could use this solution.
First we need to reshape the data by sex.
library(reshape2)
df2 <- dcast(causesSort, ... ~ causesDf.1 , value.var="Total")
Then we generate rownames from the type column and delete this column.
rownames(df2) <- df2[, 1]
df2 <- df2[, -1]
Then we order the data by one column, e.g. by Female.
df2 <- df2[order(-df2$Female), ]
The labels are the rownames.
# labs <- rownames(df2)
However, since they are very long (and bad for the reader's eye!), we may have to think of shorter ones. A workaround is to shorten them a little.
# Short axis labels: the first word of each row name, cut to 8 characters.
# vapply() with character(1) always yields a character vector, whereas
# sapply() would return an empty list if `df2` had no rows.
first_words <- vapply(
  strsplit(rownames(df2), " "),
  function(x) x[1],
  character(1)
)
labs <- substr(first_words, 1, 8)
Now we are able to apply barplot().
pos <- barplot(t(df2), beside=TRUE, xaxt="n",
col=c("#3C6688", "#45A778"), border="white")
pos gives us a matrix of bar positions. Because we have a grouped plot, we need the column means, which we can then use to place the axis labels.
axis(1, colMeans(pos), labs, las=2)
Result
Here is a ggplot2 solution. This may provide better control over the final output.
library(dplyr)
library(ggplot2)
# Give the three columns predictable names.
names(causesDf) <- c("Type", "Gender", "Total")

# Order the gender levels so that Male comes before Female.
causesDf$Gender <- factor(
  causesDf$Gender,
  levels = c("Male", "Female"),
  ordered = TRUE
)

# Rank the types by their overall total, largest first, and use that ranking
# as the level order of Type.
sorted <- causesDf %>%
  group_by(Type) %>%
  summarize(gtotal = sum(Total)) %>%
  arrange(desc(gtotal))
causesDf$Type <- factor(causesDf$Type, levels = sorted$Type, ordered = TRUE)

# Dodged bar chart with x-axis labels rotated by 45 degrees.
g <- ggplot(causesDf, aes(x = Type, y = Total, group = Gender, fill = Gender)) +
  geom_col(position = "dodge") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_manual(values = alpha(c("blue", "green"), .5))
print(g)

Resources