plot r two categorical variables - r

I am using below command to plot two categorical variables in R
gender has 2 levels and Income has 9 levels.
spineplot(main$Gender,main$Income, xlab="Gender", ylab="Income levels: 1 is lowest",xaxlabels=c("Male","Female"))
It produces chart like below
How can i plot this chart in color?
How can i show % of each income level within each box? for example female income level 1 has 21% of data. How can i show 21% within the dark colored area?
################update 1
Adding reproducible example
fail <- factor(c(2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1,
1, 1, 1, 2, 1, 1, 1, 1, 1,2,2,2,2),
levels = c(1, 2), labels = c("male", "female"))
gender <- factor(rep(c(1:9),3))
spineplot(fail,gender)

I think it may be easier to do this with a barplot since spineplot doesn't return anything useful.
The default would be the following, but you can adjust the widths of the bars to some other variable (you can see the x-axis coordinates are returned):
par(mfrow = 1:2)
(barplot(table(gender, fail)))
# [1] 0.7 1.9
(barplot(table(gender, fail), width = table(fail)))
# [1] 10.7 26.9
With some final touches we get
tbl <- table(gender, fail)
prp <- prop.table(tbl, 2L)
yat <- prp / 2 + apply(rbind(0, prp[-nrow(prp), ]), 2L, cumsum)
bp <- barplot(prp, width = table(fail), axes = FALSE, col = rainbow(nrow(prp)))
axis(2L, at = yat[, 1L], labels = levels(gender), lwd = 0)
axis(4L)
text(rep(bp, each = nrow(prp)), yat, sprintf('%0.f%%', prp * 100), col = 0)
Compare to
spineplot(fail, gender, col = rainbow(nlevels(gender)))

An alternative to the interesting solution of #rawr is:
fail <- factor(c(2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1,
1, 1, 1, 2, 1, 1, 1, 1, 1,2,2,2,2),
levels = c(1, 2), labels = c("male", "female"))
gender <- factor(rep(c(1:9),3))
mypalette <- colorRampPalette(c("lightblue","darkblue"))
tbl <- spineplot(fail, gender, xlab="Gender", ylab="Income levels: 1 is lowest",
xaxlabels=c("Male","Female"), col=mypalette(nlevels(gender)) )
print(tbl)
# Income levels: 1 is lowest
# Gender 1 2 3 4 5 6 7 8 9
# male 2 1 2 1 3 2 2 2 1
# female 1 2 1 2 0 1 1 1 2
print.perc <- function(k, tbl, ndigits=2, str.pct="%") {
# These lines of codes are the same used by from spineplot
# for the calculation of the x-position of the stacked bars
nx <- nrow(tbl)
off <- 0.02
xat <- c(0, cumsum(prop.table(margin.table(tbl, 1)) + off))
posx <- (xat[1L:nx] + xat[2L:(nx + 1L)] - off)/2
# Proportions by row (gender)
ptbl <- prop.table(tbl,1)
# Define labels as strings with a given format
lbl <- paste(format(round(100*ptbl[k,], ndigits), nsmall=ndigits), str.pct, sep="")
# Print labels
# cumsum(ptbl[k,])-ptbl[k,]/2 is the vector of y-positions
# for the centers of each stacked bar
text(posx[k], cumsum(ptbl[k,])-ptbl[k,]/2, lbl)
}
# Print income levels for males and females
strsPct <- c("%","%")
for (k in 1:nrow(tbl)) print.perc(k, tbl, ndigits=2, str.pct=strsPct[k])
Hope it can help you.

Related

How can I match coordinates with their associated values in R

I have 250 points that I generated within a rectangle (-4,4)x(-6,6). If the popints are within a certain space they are blue and if they are outside of that space they are red.
The code I used for this is here, where i defined the confined space with squares:
library(sf)
border <- matrix(c(
-6, -4,
-6, 4,
6, 4,
6, -4,
-6, -4
), ncol = 2, byrow = TRUE) |>
sfheaders::sfc_polygon()
# sample random points
rand_points <- st_sample(border, size = 250)
squares1 <- matrix(c(
-4, 0,
-4, 3,
-1, 3,
-1, 0,
-4, -0
), ncol = 2, byrow = TRUE) |>
sfheaders::sfc_polygon()
squares2 <- matrix(c(
-2, -4,
-2, -1,
1, -1,
1, -4,
-2, -4
), ncol = 2, byrow = TRUE) |>
sfheaders::sfc_polygon()
squares3 <- matrix(c(
2, -2,
2, 1,
5, 1,
5, -2,
2, -2
), ncol = 2, byrow = TRUE) |>
sfheaders::sfc_polygon()
squares <- c(squares1, squares2, squares3)
red_vals <- st_difference(rand_points, squares)
blue_vals <- st_intersection(rand_points, squares)`
plot(border)
plot(negative_vals, add = TRUE, col = "red")
plot(positive_vals, add = TRUE, col = "blue")
My goal is to match the points' coordinates with their expected value. Example:
In the table, the third column is for the blue points and the fourth column for the red. If the point at that coordinate is blue it gets a +1 and if it is not blue at that coordinate -1, and vice versa for the red points.
So far, I have attained the coordinates of all the points.
y <- c(red_vals)
x <- c(blue_vals)
cdata <- c(x, y)
coord <- st_coordinates(cdata)`
I am now stuck on trying to figure out how I can classify x and y to their respective coordinates and indicate this in a dataframe.
Any help is appreciated.
You could do:
red_vals <- rand_points[rowSums(st_intersects(rand_points, squares, F)) == 0]
blue_vals <- st_intersection(rand_points, squares)
df <- rbind(cbind(st_coordinates(red_vals), PosGroup = 1, NegGroup = -1),
cbind(st_coordinates(red_vals), PosGroup = -1, NegGroup = 1)) |>
as.data.frame()
head(df)
#> X Y PosGroup NegGroup
#> X1 -5.2248158 0.03710509 1 -1
#> X2 -5.8932331 -1.41421992 1 -1
#> X3 -0.0609895 0.26541100 1 -1
#> X4 1.7345333 -3.04312404 1 -1
#> X5 -4.6801643 0.24656851 1 -1
#> X6 1.3190239 3.36491623 1 -1
Obviously the first few values are all red dots.
We can see that the points are correct by using this data frame to draw points in ggplot:
library(ggplot2)
df %>%
ggplot() +
geom_sf(data = squares) +
geom_point(aes(X, Y, color = factor(PosGroup)), pch = 1, size = 3) +
theme_classic() +
scale_color_brewer(palette = "Set1", direction = -1)

How to export the frequency tables (categorical data) and descreptive analysis (continuous variables) from a big dataframe from R into excel?

I have this dataframe:
df = data.frame(x = c(1,0,0,0,1,1,1), y = c(2,2,2,2,3,3,2),
z = 1:7, m = c(1,2,3,1,2,3,1) )
df$x = factor(df$x)
df$y = factor(df$y)
df$m = factor(df$m)
I want to extract all the descriptive analysis information of each of these variables into excel in a simple way so I can present the results of my work.
There is little information here and it unclear how you want to present this, but if you would like to get summary statistics (tables [categorical] and median/IQR [continuous]), you can use lapply across all columns. This will output as a list from which you can manipulate to export as you see fit.
Data
df <- data.frame(x = c(1, 0, 0, 0, 1, 1, 1),
y = c(2, 2, 2, 2, 3, 3, 2),
z = 1:7,
m = c(1, 2, 3, 1, 2, 3, 1))
cats <- c("x", "y", "m")
df[cats] <- lapply(df[cats], as.factor)
Run lapply
results_list <- lapply(df[1:ncol(df)], summary)
Output:
> results_list
$x
0 1
3 4
$y
2 3
5 2
$z
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.0 2.5 4.0 4.0 5.5 7.0
$m
1 2 3
3 2 2

How to create ggplot box plot which add data over time

I have a dataframe, which gives values for different courses over a series of weeks.
Course Week m
1 UGS200H 1 44.33333
2 CMSE201 1 73.66667
3 CMSE201 2 88.16667
4 CMSE201 2 88.16667
5 PHY215 2 73.66667
6 PHY215 3 86.33333
7 CMSE201 3 84.00000
8 UGS200H 4 60.66667
9 UGS200H 4 76.66667
I would like to create a series of box plots which plot m values over the weeks for each course. I would like for the box plots to build off of each other though, such that week 1 contains only the data from Week = 1, but week 2 contains data including data from Week = 1 and 2, and week 3 includes data from Week = 1,2,3 and etc. I have create the following code which creates the box plots but without the building up over the weeks.
d <- subset(data_manual)
a <- ggplot(data=d, aes(x=(Week), fill = Course, y=(m), group=interaction(Course, Week)))
geom_boxplot()+
scale_y_continuous(limits = c(-2, 100), breaks = seq(0, 100, by = 20))+
xlab('Week') +
ylab('Course-Level SE') +
print(a) #show us the plot!!
}
This gives plots like this
But these are just individual weeks, not the summed version that I would like. Is there a way to have them build and plot the multiple weeks on one plot?
How about this:
# dat <- tibble::tribble(
# ~Course, ~Week, ~m,
# "UGS200H", 1, 44.33333,
# "CMSE201", 1, 73.66667,
# "CMSE201", 2, 88.16667,
# "CMSE201", 2, 88.16667,
# "PHY215", 2, 73.66667,
# "PHY215", 3, 86.33333,
# "CMSE201", 3, 84.00000,
# "UGS200H", 4, 60.66667,
# "UGS200H", 4, 76.66667)
dat <- data.frame(
Course = rep(c("A", "B", "C"), each=1000),
Week = rep(rep(1:10, each=100), 3),
m = runif(3000, 50, 100)
)
library(ggplot2)
dats <- lapply(1:max(dat$Week), \(i){
tmp <- subset(dat, Week <= i)
tmp$plot_week <- i
tmp})
dats <- do.call(rbind, dats)
table(dat$Week)
#>
#> 1 2 3 4 5 6 7 8 9 10
#> 300 300 300 300 300 300 300 300 300 300
table(dats$plot_week)
#>
#> 1 2 3 4 5 6 7 8 9 10
#> 300 600 900 1200 1500 1800 2100 2400 2700 3000
ggplot(data=dats, aes(x=as.factor(plot_week), fill = Course, y=(m), group=interaction(Course, plot_week))) +
geom_boxplot()+
scale_y_continuous(limits = c(-2, 100), breaks = seq(0, 100, by = 20))+
xlab('Week') +
ylab('Course-Level SE')
Created on 2022-10-18 by the reprex package (v2.0.1)
Basically the same idea as by #DaveArmstrong but using lapply with multiple geom_boxplots.
Note 1: To make the example a bit more realistic I use some random fake example data.
Note 2: I added an additional geom_point just to check that the number of obs. is actually increasing for each week.
set.seed(123)
d <- data.frame(
Course = rep(c("UGS200H", "CMSE201", "PHY215"), each = 40),
Week = rep(1:4, 30),
m = runif(120, 40, 100)
)
library(ggplot2)
ggplot(data=d, aes(x = factor(Week), fill = Course, y=m)) +
lapply(unique(d$Week), function(x) {
list(
geom_boxplot(data = subset(d, Week <= x) |> transform(Week = x), position = "dodge"),
geom_point(data = subset(d, Week <= x) |> transform(Week = x), position = position_dodge(.9), alpha = .2)
)
}) +
labs(x = 'Week', y = 'Course-Level SE')

Ploting barplot differently in R

Suppose I have the following data frame:
> example
col1 col2 col3
1 -1 1 -1
2 0 -1 3
3 1 10 -1
and I want to plot a barplot and using row 3 as an example, I do barplot(example[3,]). This works perfectly. However, I want to flip the value and add more color -- specifically, I want:
if the value is negative (i.e., -1 in row 3), I want to flip it into +1 and color red when plotting the boxplot. (but note that there are +1 value in the row already and we don't want to color that in red)
if the value is >= +10, color the column green in the boxplot
How can I do the above?
> dput(example)
structure(list(col1 = c(-1, 0, 1), col2 = c(1, -1, 10), col3 = c(-1,
3, -1)), row.names = c(NA, 3L), class = "data.frame")
Here it is a complet solution using only base R :
df <- structure(list(col1 = c(-1, 0, 1),
col2 = c(1, -1, 10),
col3 = c(-1, 3, -1)),
row.names = c(NA, 3L),
class = "data.frame")
# apply conditions on the matrix to color your plot
# If I well understand your demand, it is to have specific color with
# respect to specific condition. Multiply by 2 is to have different factor
level
color <- (df >= 10) * 2 + (df < 0) + 1
# to swap -1 to +1 do that :
df[df < 0] <- df[df < 0] + 2
# set color as wishes
color <- matrix(c("black", "red", "green")[color], nrow = nrow(color), ncol = ncol(color), byrow = F)
# plot the vector we want
barplot(df[,3], col = color[,3])
EDIT 1
To plot the row 3 you can use this trick with transposition function t() :
barplot(t(df)[, 3], col = t(color)[, 3])

For each row, return the column name of the largest value whilst assigning ties to new groups

I have three variables; and I want to create a new varible showing which column that had the highest number. Data:
x= c(5, 1, 4, 5, 5, 1, 1)
y= c(1, 2, 4, 5, 1, 4, 1)
z= c(1, 1, 5, 3, 5, 4, 1)
data <-data.frame(x, y, z)
Importantly if there are a tie I want this to be indicated too, so that.
1= x is highest
2= y is highest
3= z is highest
4= x and y is highest as tie
5= x and z is highest as a tie
6 = y and z is highest as a tie
7 = x, y and z is all equally high.
I've tried below, but it doesn't handle the ties correctly.
data$Highest <- apply(data, 1, which.max)
data
PS. The correct new variable that I would like to get from the data above should be:
correct= c(1, 2, 3, 4, 5, 6, 7)
fun <- function(v) {
stopifnot(length(v) == 3L)
if (anyNA(v)) stop("NA values in input")
if (length(unique(v)) == 1L) return(7L)
rk <- rank(v)
if (max(rk) %% 1 == 0L) return(which.max(rk))
test <- rk %% 1 != 0L
if (sum(test) == 2L) return(sum(which(test)) + 1L)
stop("undefined case")
}
apply(data, 1, fun)
#[1] 1 2 3 4 5 6 7
You can do:
library(plyr)
combn2 <- function(x, y) combn(y, x, paste, collapse="")
x = unlist(sapply(1:ncol(data), combn2, names(data)))
vec = alply(data, 1, function(u) which(paste(names(data)[max(u)==u], collapse='')==x))
#unlist(vec)
#1 2 3 4 5 6 7

Resources