Generate random pairs of factors in dataframe using R - r

I have a data frame with a character column of names, and I would like to randomly generate pairs of names from that column. My code gives all possible combinations, but what I want is for every name to be paired exactly once, in random order; a name cannot be paired with itself.
My code is:
# Creating a dataframe
df <- data.frame(
  Name = c("Amiya", "Raj", "Asish", "John", "ruban", "mary", "barath", "leema", "joshi", "indhu", "praveen", "joshua",
           "alex", "martin", "stella", "veronica", "henry", "rajesh", "yusuf", "jenita", "johana", "jerald", "jegan", "lincy",
           "jona", "rani", "julie", "ross", "chandler", "monica", "penny", "sheldon"),
  Sex = c("Female", "Male", "Male", "male", "male", "Female", "male", "Female", "Female", "Female", "male",
          "male", "male", "male", "Female", "Female", "male", "male", "male", "Female", "Female", "male",
          "male", "Female", "Female", "Female", "Female", "male", "male", "Female", "Female", "male"),
  Number = c(8937998889, 2598279874, 4589987483, 2876876877, 2876876876, 2487698798, 2879879877, 2887987897, 2878798733,
             4309808098, 8748098990, 9883798798, 8734787987, 8973498787, 8734887877, 9798374877, 8786487687, 7275687263,
             4379879847, 8943787876, 3874879874, 8978973987, 8978347878, 8839478768, 9378887774, 8467676764, 7246276874,
             7478798743, 6576787877, 7328776876, 6648678833, 6378787878)
)
print(df)

# Accessing the first column (Name) and printing it
cat("Accessing first and second column\n")
dat <- df[, 1]
print(dat)

# Every unordered pair of distinct names, one pair per row.
# combn() takes the pair size as its own second argument (m = 2); the
# original passed the 2 to unique() instead, leaving combn() without `m`.
t(combn(unique(dat), 2))
TIA

Get the unique elements of the 'Name' column, shuffle them with sample(), and convert the result to a matrix with 2 columns (assuming the number of unique elements is even); each row is then one random pair.
# Shuffle the distinct names and lay them out as a two-column matrix: each row
# is one random pair, and since every name occurs once, nobody is paired with
# themselves.
shuffled <- sample(unique(df$Name))
# With an odd number of names, matrix() would recycle the first name to fill
# the last cell (pairing that person twice). Pad with NA instead so the
# leftover person is shown as unpaired.
if (length(shuffled) %% 2L == 1L) {
  shuffled <- c(shuffled, NA)
}
matrix(shuffled, ncol = 2)

Related

circlize::circos.heatmap Add gaps between cells

I am creating a circular heatmap as follows:
suppressPackageStartupMessages({
library(circlize)
})
# input data
dput(annot)
structure(list(Specimen_Type = c("Both", "Plasma", "Both", "Both",
"Plasma", "Plasma", "Plasma", "Both", "Both", "Both", "Plasma",
"Plasma", "Both", "Both", "Both", "Both", "Both", "Both", "Both",
"Both", "Both", "Plasma", "Both", "Both", "Plasma", "Both", "Plasma",
"Plasma", "Both", "Plasma", "Both", "CSF", "Both", "Plasma",
"Both", "Both", "Both", "Plasma", "Both", "Plasma", "Both", "Plasma",
"Plasma", "Both", "Both", "Plasma", "Both", "Both", "Plasma",
"Plasma", "Plasma", "Plasma", "Plasma", "Both", "Both"), Sex = c("Female",
"Female", "Female", "Male", "Female", "Female", "Female", "Female",
"Female", "Female", "Male", "Male", "Male", "Male", "Male", "Male",
"Female", "Female", "Male", "Male", "Female", "Female", "Male",
"Male", "Female", "Male", "Male", "Male", "Female", "Female",
"Male", "Male", "Female", "Female", "Male", "Male", "Female",
"Male", "Male", "Male", "Male", "Female", "Male", "Male", "Female",
"Female", "Male", "Male", "Female", "Male", "Male", "Male", "Female",
"Female", "Female")), row.names = c("15635-29", "15635-31", "15635-32",
"15635-37", "15635-38", "15635-182", "15635-42", "15635-43",
"15635-45", "15635-46", "15635-53", "15635-215", "15635-58",
"15635-60", "15635-63", "15635-68", "15635-70", "15635-75", "15635-80",
"15635-81", "15635-87", "15635-90", "15635-100", "15635-101",
"15635-108", "15635-120", "15635-127", "15635-129", "15635-132",
"15635-134", "15635-135", "15635-1", "15635-2", "15635-251",
"15635-7", "15635-11", "15635-145", "15635-148", "15635-150",
"15635-154", "15635-156", "15635-158", "15635-161", "15635-169",
"15635-170", "15635-187", "15635-197", "15635-214", "15635-228",
"15635-225", "15635-246", "15635-254", "15635-234", "15635-239",
"15635-279"), class = "data.frame")
# Split the heatmap by specimen type (passed as `split` below).
split <- factor(annot$Specimen_Type)
# One colour per value occurring in the two annotation columns (Sex and
# Specimen_Type); unlist() below flattens the list to a named character vector.
col_fun1 <- list("Male" = "navy",
                 "Female" = "deeppink4",
                 "Plasma" = "#fcff5c",
                 "CSF" = "#8d14ff",
                 "Both" = "#14f9ff")
circos.par(start.degree = 30, gap.degree = 1, points.overflow.warning = FALSE)
circos.heatmap(annot,
               split = split,
               col = unlist(col_fun1),
               track.height = 0.4,
               bg.border = "gray50", bg.lty = 1.5,
               # spelled out as TRUE: `T` is an ordinary variable that can be reassigned
               show.sector.labels = TRUE)
# Reset the circular layout parameters once the plot is finished.
circos.clear()
How do I add gaps between individual cells in the heatmap?
The solution was to update the circlize package and pass the cell.border = "white" argument to circos.heatmap().

filter DataFrame to exclude specific dates

I want to know how to filter a DataFrame to exclude specific and discrete dates.
# Input Data
dates = c("2021-03-31", "2021-05-02", "2021-06-30", "2021-10-22")
dates = as.Date(dates)
x = structure(list(Gender = c("Male", "Female", "Male", "Male", "Female",
"Male", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Male", "Male", "Female",
"Female", "Female", "Male", "Female", "Female", "Male", "Female",
"Male", "Female", "Female", "Female", "Male", "Male", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Male",
"Female", "Male", "Female", "Male", "Female", "Female", "Female",
"Male", "Male", "Female", "Female", "Female", "Male", "Male",
"Female", "Female", "Female", "Male", "Female", "Male", "Female",
"Male", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Male", "Female", "Female", "Female", "Female",
"Female", "Male", "Male", "Female", "Male", "Female", "Female",
"Male", "Female", "Female", "Female", "Female", "Female", "Male",
"Female", "Female", "Male", "Female", "Female", "Female", "Female",
"Female"), `Termination Date` = c("2021-01-05", "2021-02-12",
"2021-02-22", "2021-02-24", "2021-03-12", "2021-03-12", "2021-03-24",
"2021-03-26", "2021-03-31", "2021-03-31", "2021-03-31", "2021-03-31",
"2021-03-31", "2021-03-31", "2021-03-31", "2021-03-31", "2021-03-31",
"2021-04-02", "2021-04-02", "2021-04-05", "2021-04-09", "2021-04-30",
"2021-05-05", "2021-05-11", "2021-05-11", "2021-05-14", "2021-05-21",
"2021-05-21", "2021-05-24", "2021-06-01", "2021-06-11", "2021-06-11",
"2021-06-14", "2021-06-24", "2021-06-27", "2021-06-27", "2021-06-27",
"2021-06-27", "2021-07-02", "2021-07-07", "2021-07-23", "2021-07-26",
"2021-07-26", "2021-07-27", "2021-07-30", "2021-08-02", "2021-08-06",
"2021-08-06", "2021-08-09", "2021-08-11", "2021-08-13", "2021-08-13",
"2021-08-13", "2021-08-13", "2021-08-16", "2021-08-18", "2021-08-20",
"2021-08-23", "2021-08-24", "2021-08-25", "2021-08-27", "2021-08-27",
"2021-08-30", "2021-08-30", "2021-08-31", "2021-09-01", "2021-09-03",
"2021-09-03", "2021-09-15", "2021-09-16", "2021-09-20", "2021-09-22",
"2021-09-23", "2021-09-23", "2021-09-24", "2021-09-24", "2021-10-01",
"2021-10-04", "2021-10-06", "2021-10-08", "2021-10-08", "2021-10-08",
"2021-10-11", "2021-10-14", "2021-10-19", "2021-10-20", "2021-10-21",
"2021-10-22", "2021-10-22", "2021-10-29", "2021-11-02", "2021-11-03",
"2021-11-08", "2021-11-09", "2021-11-16", "2021-11-16", "2021-11-17"
)), row.names = c(229L, 8247L, 3068L, 7222L, 3746L, 3912L, 8019L,
3610L, 6078L, 6085L, 6271L, 6284L, 6285L, 6310L, 6321L, 6335L,
6336L, 3697L, 9149L, 8217L, 3734L, 220L, 6729L, 5562L, 7729L,
7933L, 5291L, 7232L, 1647L, 7335L, 3418L, 7189L, 2912L, 7790L,
6088L, 6247L, 6281L, 6338L, 7608L, 6614L, 410L, 2746L, 8296L,
3117L, 177L, 2788L, 3301L, 6221L, 5173L, 2092L, 3577L, 6219L,
6973L, 9020L, 1274L, 1768L, 8218L, 1822L, 2499L, 8107L, 1910L,
4756L, 2739L, 7342L, 7857L, 6519L, 2104L, 3666L, 7506L, 2635L,
3402L, 5566L, 2637L, 3036L, 2976L, 3871L, 8376L, 3112L, 4772L,
6449L, 8200L, 8445L, 3310L, 4005L, 3219L, 8241L, 8266L, 2995L,
3273L, 8401L, 3336L, 3118L, 2272L, 3333L, 3370L, 3952L, 7339L
), class = "data.frame")
Normally, I would do the following but I assume it doesn't work in this case since I am using a date class. How would I do this using dates?
# Filter df to exclude rows that were entered on a date from the list
x[!(x$`Termination Date` %in% dates), ]
When I run your example data, I see the Termination Date column is interpreted as the character class, not the date class.
Here is a solution that uses the tidyverse:
# Input Data
dates = c("2021-03-31", "2021-05-02", "2021-06-30", "2021-10-22")
x = structure(list(Gender = c("Male", "Female", "Male", "Male", "Female",
"Male", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Male", "Male", "Female",
"Female", "Female", "Male", "Female", "Female", "Male", "Female",
"Male", "Female", "Female", "Female", "Male", "Male", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Male",
"Female", "Male", "Female", "Male", "Female", "Female", "Female",
"Male", "Male", "Female", "Female", "Female", "Male", "Male",
"Female", "Female", "Female", "Male", "Female", "Male", "Female",
"Male", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Male", "Female", "Female", "Female", "Female",
"Female", "Male", "Male", "Female", "Male", "Female", "Female",
"Male", "Female", "Female", "Female", "Female", "Female", "Male",
"Female", "Female", "Male", "Female", "Female", "Female", "Female",
"Female"), `Termination Date` = c("2021-01-05", "2021-02-12",
"2021-02-22", "2021-02-24", "2021-03-12", "2021-03-12", "2021-03-24",
"2021-03-26", "2021-03-31", "2021-03-31", "2021-03-31", "2021-03-31",
"2021-03-31", "2021-03-31", "2021-03-31", "2021-03-31", "2021-03-31",
"2021-04-02", "2021-04-02", "2021-04-05", "2021-04-09", "2021-04-30",
"2021-05-05", "2021-05-11", "2021-05-11", "2021-05-14", "2021-05-21",
"2021-05-21", "2021-05-24", "2021-06-01", "2021-06-11", "2021-06-11",
"2021-06-14", "2021-06-24", "2021-06-27", "2021-06-27", "2021-06-27",
"2021-06-27", "2021-07-02", "2021-07-07", "2021-07-23", "2021-07-26",
"2021-07-26", "2021-07-27", "2021-07-30", "2021-08-02", "2021-08-06",
"2021-08-06", "2021-08-09", "2021-08-11", "2021-08-13", "2021-08-13",
"2021-08-13", "2021-08-13", "2021-08-16", "2021-08-18", "2021-08-20",
"2021-08-23", "2021-08-24", "2021-08-25", "2021-08-27", "2021-08-27",
"2021-08-30", "2021-08-30", "2021-08-31", "2021-09-01", "2021-09-03",
"2021-09-03", "2021-09-15", "2021-09-16", "2021-09-20", "2021-09-22",
"2021-09-23", "2021-09-23", "2021-09-24", "2021-09-24", "2021-10-01",
"2021-10-04", "2021-10-06", "2021-10-08", "2021-10-08", "2021-10-08",
"2021-10-11", "2021-10-14", "2021-10-19", "2021-10-20", "2021-10-21",
"2021-10-22", "2021-10-22", "2021-10-29", "2021-11-02", "2021-11-03",
"2021-11-08", "2021-11-09", "2021-11-16", "2021-11-16", "2021-11-17"
)), row.names = c(229L, 8247L, 3068L, 7222L, 3746L, 3912L, 8019L,
3610L, 6078L, 6085L, 6271L, 6284L, 6285L, 6310L, 6321L, 6335L,
6336L, 3697L, 9149L, 8217L, 3734L, 220L, 6729L, 5562L, 7729L,
7933L, 5291L, 7232L, 1647L, 7335L, 3418L, 7189L, 2912L, 7790L,
6088L, 6247L, 6281L, 6338L, 7608L, 6614L, 410L, 2746L, 8296L,
3117L, 177L, 2788L, 3301L, 6221L, 5173L, 2092L, 3577L, 6219L,
6973L, 9020L, 1274L, 1768L, 8218L, 1822L, 2499L, 8107L, 1910L,
4756L, 2739L, 7342L, 7857L, 6519L, 2104L, 3666L, 7506L, 2635L,
3402L, 5566L, 2637L, 3036L, 2976L, 3871L, 8376L, 3112L, 4772L,
6449L, 8200L, 8445L, 3310L, 4005L, 3219L, 8241L, 8266L, 2995L,
3273L, 8401L, 3336L, 3118L, 2272L, 3333L, 3370L, 3952L, 7339L
), class = "data.frame")
library(dplyr)

# Keep only the rows whose termination date is not in `dates`.
# Both sides are converted with as.character() so the comparison is
# like-for-like whether the column / `dates` hold "YYYY-MM-DD" strings (as in
# this example) or Date values; mixing character and Date inside %in% can
# silently match nothing.
df_without_undesired_dates <- x %>%
  filter(!as.character(`Termination Date`) %in% as.character(dates))

Factors (the data structure) in R

I am teaching myself R using a textbook written by Richard Cotton and have gotten to the chapter on factors. The book says that for
# Note: since R 4.0.0, data.frame() defaults to stringsAsFactors = FALSE, so the
# character column below is stored as character (not factor) unless
# stringsAsFactors = TRUE is passed or the column is wrapped in factor().
heights <- data.frame(height_cm = c(153, 181, 150, 172, 165, 149, 174, 169, 198, 163),
gender = c("female", "male", "female", "male", "male",
"female", "female", "male", "male", "female"))
heights$gender is a factor but whenever I call class(heights$gender) it returns "character". My question is: why is it a character vector instead of a factor like the textbook says it should be?

X- axis labels are not properly aligned in R barplot

I have a data.table that I aggregate by group with the code below, in order to create a stacked bar chart:
causesDf <- causesDf[, c('Type', 'Gender', 'Total')]
causesSort <- causesDf[, lapply(.SD, sum),
by=list(causesDf$Type, causesDf$Gender)]
and Data will be like below:
causesDf causesDf.1 Total
1: Illness (Aids/STD) Female 2892
2: Change in Economic Status Female 4235
3: Cancellation/Non-Settlement of Marriage Female 6126
4: Family Problems Female 133181
5: Illness (Aids/STD) Male 5831
6: Change in Economic Status Male 31175
7: Cancellation/Non-Settlement of Marriage Male 5170
and so on..
I am trying to make barplot like below:
barpos <- barplot(sort(causesSort$Total, decreasing=TRUE),
col=c("red","green"), xlab="", ylab="",
horiz=FALSE, las=2)
legend("topright", c("Male","Female"), fill=c("red","green"))
end_point <- 0.2 + nrow(causesSort) + nrow(causesSort) - 0.1
text(seq(0.1, end_point, by=1), par("usr")[3] - 30,
srt=60, adj= 1, xpd=TRUE,
labels=paste(causesSort$causesDf), cex=0.65)
However, the x-axis labels do not line up with the bars. Did I miss anything?
Expected output like:
Edited:
causesSort
structure(list(causesDf = c("Illness (Aids/STD)", "Change in Economic Status",
"Cancellation/Non-Settlement of Marriage", "Physical Abuse (Rape/Incest Etc.)",
"Dowry Dispute", "Family Problems", "Ideological Causes/Hero Worshipping",
"Other Prolonged Illness", "Property Dispute", "Fall in Social Reputation",
"Illegitimate Pregnancy", "Failure in Examination", "Insanity/Mental Illness",
"Love Affairs", "Professional/Career Problem", "Divorce", "Drug Abuse/Addiction",
"Not having Children(Barrenness/Impotency", "Causes Not known",
"Unemployment", "Poverty", "Death of Dear Person", "Cancer",
"Suspected/Illicit Relation", "Paralysis", "Property Dispute",
"Unemployment", "Poverty", "Family Problems", "Illness (Aids/STD)",
"Drug Abuse/Addiction", "Other Prolonged Illness", "Death of Dear Person",
"Causes Not known", "Cancer", "Not having Children(Barrenness/Impotency",
"Cancellation/Non-Settlement of Marriage", "Paralysis", "Physical Abuse (Rape/Incest Etc.)",
"Professional/Career Problem", "Love Affairs", "Fall in Social Reputation",
"Dowry Dispute", "Ideological Causes/Hero Worshipping", "Illegitimate Pregnancy",
"Failure in Examination", "Change in Economic Status", "Insanity/Mental Illness",
"Divorce", "Suspected/Illicit Relation", "Not having Children (Barrenness/Impotency",
"Not having Children (Barrenness/Impotency"), causesDf.1 = c("Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Male", "Male", "Male", "Male",
"Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male",
"Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male",
"Male", "Male", "Male", "Male", "Male", "Female", "Male"), Total = c(2892,
4235, 6126, 2662, 31206, 133181, 776, 69072, 4601, 4697, 2391,
12054, 33352, 21339, 1596, 2535, 1205, 5523, 148134, 3748, 7905,
4707, 2878, 8093, 2284, 14051, 23617, 24779, 208771, 5831, 28841,
125493, 5614, 304985, 6180, 2299, 5170, 5002, 1330, 10958, 23700,
8767, 764, 1342, 103, 14951, 31175, 60877, 1598, 6818, 544, 222
)), row.names = c(NA, -52L), class = c("data.table", "data.frame"
)
# , .internal.selfref = <pointer: 0x00000000098d1ef0> # seems not to work
)
If you don't rely on 45° rotation (that one is a bit more tricky) you could use this solution.
First we need to reshape the data by sex.
library(reshape2)
df2 <- dcast(causesSort, ... ~ causesDf.1 , value.var="Total")
Then we generate rownames from the type column and delete this column.
rownames(df2) <- df2[, 1]
df2 <- df2[, -1]
Then we order the data by one column, e.g. by Female.
df2 <- df2[order(-df2$Female), ]
The labels are the rownames.
# labs <- rownames(df2)
However, since they are very long (and bad for the reader's eye!), we may have to think of shorter ones. A workaround is to shorten them a little.
# Short axis labels: the first word of each row name, cut to 8 characters.
# vapply() with a character(1) template guarantees a character vector back
# (sapply() would return a list for empty input).
labs <- substr(vapply(strsplit(rownames(df2), " "),
                      function(x) x[1], character(1)), 1, 8)
Now we are able to apply barplot().
pos <- barplot(t(df2), beside=TRUE, xaxt="n",
col=c("#3C6688", "#45A778"), border="white")
pos gives us a matrix of bar positions, because we have a grouped plot we need the column means. We can use it to plot the axis.
axis(1, colMeans(pos), labs, las=2)
Result
Here is ggplot2 solution. This may provide better control over the final output
library(dplyr)
library(ggplot2)

# Give the three columns the names used in the rest of this snippet.
names(causesDf) <- c('Type', 'Gender', 'Total')

# Ordered factor so that Male comes before Female.
gender_levels <- c("Male", "Female")
causesDf$Gender <- factor(causesDf$Gender, levels = gender_levels, ordered = TRUE)

# Total per type (both genders together), largest first; that ranking becomes
# the level order of Type and therefore the left-to-right order of the bars.
type_totals <- causesDf %>%
  group_by(Type) %>%
  summarize(gtotal = sum(Total)) %>%
  arrange(desc(gtotal))
causesDf$Type <- factor(causesDf$Type, levels = type_totals$Type, ordered = TRUE)

# Side-by-side bars per gender, semi-transparent fills, x labels rotated 45°.
fill_colours <- alpha(c("blue", "green"), .5)
g <- ggplot(causesDf, aes(x = Type, y = Total, group = Gender, fill = Gender)) +
  geom_col(position = "dodge") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_manual(values = fill_colours)
print(g)

Adding different vertical lines on facets ggplot2 [duplicate]

This question already has answers here:
facet_wrap add geom_hline
(2 answers)
Closed 5 months ago.
I am quite a newbie to RStudio and I am having problems adding different vertical lines on each of my two facets using facet_wrap
Here is what I thought:
library(ggplot2)
g <- ggplot(dat, aes(x=index, na.rm= TRUE))
d <- g+ geom_density() + facet_wrap(~gender)
vline.data <- data.frame(z = c(2.36,2.48),gender = c("Female","Male"))
d1 <- d + geom_vline(aes(xintercept = z),vline.data)
But it adds the same two lines to each facet - what would you reckon is the problem? I have thought of somehow splitting the facets into two separate data frames, but I have no idea how to go on about it.
P.S The x-axis (the index) goes from 1 to 4.
Thank you in advance.
index <- c(NA, NA, 4, 4, 4, NA, NA, NA, NA, 2, 2, 2, 2, 2, 3, 3, 3, 3,
3, NA, 3, NA, NA, 3, 4, 4, 4, 4, 4, 3, 4, 3, 4, 3, NA, 4, 2,
4, 4, 2, 2, NA, 2, 3, 3, 2, 2, NA, NA, 2)
gender <- c("Female", "Female", "Male", "Male", "Male", "Female", "Female",
"Male", "Female", "Male", "Female", "Male", "Female", "Male",
"Male", "Female", "Female", "Female", "Male", "Female", "Female",
"Male", "Male", "Female", "Female", "Female", "Male", "Male",
"Female", "Female", "Male", "Female", "Female", "Female", "Male",
"Male", "Female", "Male", "Male", "Female", "Female", "Male",
"Male", "Female", "Female", "Male", "Male", "Male", "Male", "Male")
Using the code and data above I get this plot
This was asked a long time ago, but here's the answer:
You need to add geom_vline to your ggplot. Rather than keeping the line positions in a separate data object from the rest of your data, derive them from the plot data itself so that each facet gets its own value. This question doesn't have reproducible data, but put index, gender and however you calculate vline.data into a single dat data frame, with each of those pieces as a column. Then you can summarise that data to get the line positions. The example below assumes the line for each facet is the mean of the index column within that gender.
library(dplyr)   # %>%, group_by(), summarise()
library(ggplot2)

# Density of `index` per gender facet, with one vertical line per facet at
# that gender's mean index.
p <- ggplot(dat, aes(x = index)) +
  # na.rm belongs to the layer, not to aes(): drop missing index values here
  geom_density(na.rm = TRUE) +
  facet_wrap(. ~ gender) +
  geom_vline(
    # `. %>% ...` builds a function that ggplot2 applies to the plot data, so
    # the summary keeps the `gender` column and each line lands in its own
    # facet. na.rm = TRUE is required: index contains NA, and a mean of NA
    # would leave the line undrawn.
    data = . %>%
      group_by(gender) %>%
      summarise(vl = mean(index, na.rm = TRUE)),
    aes(xintercept = vl)
  )
How to Add Lines With A Facet R

Resources