Very new to R!
I have a survey with people answering from 0 to 10. I want to add up how many people were <= 6. How many 7 and 8. How many >=9.
I had to turn the questions (Return, Trustworthy...) into factors to make ggplots with 1 to 10 on the x axis.
# Survey analysis: one bar chart and one frequency table per question.
library(ggplot2)

uk_super_q <- read.csv("SUPR_Q_UK.csv", header = TRUE)

# Stand-alone factor copies of each question, used for the frequency tables.
uk_super_q.Return <- as.factor(uk_super_q$Return)
uk_super_q.Easy.Nav <- as.factor(uk_super_q$Easy.Nav)
uk_super_q.Credible <- as.factor(uk_super_q$Credible)
uk_super_q.Attractive <- as.factor(uk_super_q$Attractive)
uk_super_q.Trustworthy <- as.factor(uk_super_q$Trustworthy)
uk_super_q.Clean.and.Simple <- as.factor(uk_super_q$Clean.and.Simple)
uk_super_q.Easy.to.use <- as.factor(uk_super_q$Easy.to.use)
uk_super_q.NPS <- as.factor(uk_super_q$NPS)

# Convert the question columns to factors in place so that every distinct
# score gets its own tick on the x axis.
# NOTE: after this step the columns are no longer numeric, so comparisons
# such as `uk_super_q <= 6` need the scores converted back first.
questions <- c(
  "Return", "Easy.Nav", "Credible", "Attractive",
  "Trustworthy", "Clean.and.Simple", "Easy.to.use", "NPS"
)
uk_super_q[questions] <- lapply(uk_super_q[questions], as.factor)

# Bar chart of the answer counts for one question.
#   question: name of a column of `uk_super_q`, as a string.
# Returns the ggplot object; at top level it is printed automatically.
plot_question <- function(question) {
  ggplot(uk_super_q, aes(x = .data[[question]])) +
    geom_bar() +
    xlab(question) +
    ylab("Total Count")
}

plot_question("Return")
table(uk_super_q.Return)

plot_question("Easy.Nav")
table(uk_super_q.Easy.Nav)

plot_question("Credible")
table(uk_super_q.Credible)

plot_question("Attractive")
table(uk_super_q.Attractive)

plot_question("Trustworthy")
table(uk_super_q.Trustworthy)

plot_question("Clean.and.Simple")
table(uk_super_q.Clean.and.Simple)

plot_question("Easy.to.use")
table(uk_super_q.Easy.to.use)

plot_question("NPS")
table(uk_super_q.NPS)
Applying logical statements to a data.frame returns a matrix of TRUE/FALSE values, which are coded in R as 1 and 0, respectively. This allows you to count the number of TRUE values in each column with sum, or more efficiently, with colSums.
# Count, per question, how many respondents fall into each score band.
# Comparison operators are not meaningful for factors (they return NA with a
# warning), so if the question columns have been converted to factors for
# plotting, recover the numeric scores from their labels first.
# Columns that do not hold numbers become NA and therefore count as 0.
scores <- as.data.frame(
  lapply(uk_super_q, function(answer) as.numeric(as.character(answer)))
)
# na.rm = TRUE: a missing answer is left out instead of turning the whole
# column count into NA.
colSums(scores <= 6, na.rm = TRUE)               # scores 0 to 6
colSums(scores >= 7 & scores <= 8, na.rm = TRUE) # scores 7 and 8
colSums(scores >= 9, na.rm = TRUE)               # scores 9 and 10
Related
I am currently working with a dataset of "world bank islands". In that, I am trying to plot the population Vs country graph for each year. Below is the code that I have done.
library(ggplot2)
# Print large numbers in full instead of scientific notation.
options(scipen = 999)

bank <- read.csv("C:/Users/True Gamer/OneDrive/Desktop/world_bank_international_arrivals_islands.csv")

# Treat empty strings and lone dots as missing values.
bank[bank == "" | bank == "."] <- NA

# Coerce every listed column to numeric in one pass.
numeric_cols <- c(
  "country", "year", "areakm2", "pop", "gdpnom", "flights...WB",
  "hotels", "hotrooms", "receipt", "ovnarriv", "dayvisit",
  "arram", "arreur", "arraus"
)
bank[numeric_cols] <- lapply(bank[numeric_cols], as.numeric)
str(bank)

# Population per country, filled by year. No position is given, so the
# bars use the default (stacked) placement.
plot1 <- ggplot(data = bank, mapping = aes(x = country, y = pop)) +
  geom_bar(aes(fill = year), stat = "identity") +
  labs(
    title = "Population of each country yearwise",
    x = "Countries",
    y = "Population"
  )
plot1
However, when I do this, the y values shown on the graph are different from the actual y values. This is the link to the dataset
The problem is that you are stacking the bars (this is default behaviour). Also, geom_bar(stat = "identity") is just a long way of writing geom_col. One further point to note is that since all your columns are numeric, the single line:
# Convert every column of `bank` to numeric in one pass: lapply() applies
# as.numeric() column by column and as.data.frame() rebuilds the data frame.
bank <- as.data.frame(lapply(bank, as.numeric))
replaces all your individual numeric conversions.
The plot you are trying to create would be something like this:
# Population per country with one bar per year, placed side by side
# (position = "dodge") instead of stacked.
ggplot(data = bank, mapping = aes(x = country, y = pop)) +
  geom_col(mapping = aes(fill = factor(year)), position = "dodge") +
  scale_x_continuous(breaks = seq_len(27)) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Population of each country yearwise",
    x = "Countries",
    y = "Population",
    fill = "Year"
  )
However, it would probably be best to present your data in a different way. Perhaps, if you are comparing population growth, something like this would be better:
# Population over time, one small panel per country.
# Each panel gets its own y scale (scales = "free_y") so that small and
# large countries are both readable; the colour legend is dropped because
# the panel titles already identify the country.
ggplot(bank, aes(x = year, y = pop)) +
  # Lines are drawn at their data positions; dodging only applies to
  # geoms with a width (bars), so no `position` is set here.
  geom_line(aes(color = factor(country))) +
  ggtitle("Population of each country yearwise") +
  xlab("Year") +
  ylab("Population") +
  facet_wrap(. ~ country, scales = "free_y", nrow = 6) +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(breaks = c(0, 5, 10)) +
  theme_minimal() +
  theme(legend.position = "none")
Or with bars:
# Population per year as bars, one panel per country with its own y scale.
# The fill legend is hidden; the panel titles identify the country.
ggplot(data = bank, mapping = aes(x = year, y = pop)) +
  geom_col(mapping = aes(fill = factor(country)), position = "dodge") +
  facet_wrap(. ~ country, scales = "free_y", nrow = 6) +
  scale_x_continuous(breaks = c(0, 5, 10)) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Population of each country yearwise",
    x = "Year",
    y = "Population"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
I have a bar chart and I want to show the count above each bar; however, my code shows the number 1 inside the bars instead.
What I have:
What I am trying to make:
# Library
library(ggplot2)
# 1. Read data (semicolon-separated, parsed from an inline string)
df <- read.csv2(text = "Id;Date
1;2021-06-09
2;2021-06-08
3;2021-06-08
4;2021-06-09
5;2021-06-09")
# 2. Count how often each date occurs; naming the table dimension "Date"
#    gives the resulting data frame the columns Date and Freq directly.
df_date <- as.data.frame(table(Date = df[["Date"]]))
df_date
# 3. Plot bar chart
# 3. Bar chart of the pre-computed frequencies.
#    The text layer uses stat = "count", i.e. it counts the rows of
#    `df_date` again rather than reading the Freq column.
ggplot(data = df_date, mapping = aes(x = Date, y = Freq)) +
  geom_bar(stat = "identity") +
  geom_text(
    mapping = aes(label = ..count.., y = ..prop..),
    stat = "count",
    vjust = -1
  ) +
  labs(title = "Date", x = "Date", y = "Frequency") +
  theme_classic()
Since you have already calculated the frequency use geom_col.
library(ggplot2)
# `df_date` already holds one row per date with its frequency, so the bars
# and their labels are both taken straight from the Freq column.
ggplot(data = df_date, mapping = aes(x = Date, y = Freq)) +
  geom_col() +
  geom_text(mapping = aes(label = Freq), vjust = -1) +
  labs(title = "Date", x = "Date", y = "Frequency") +
  theme_classic()
If you use df you can use geom_bar as -
# Starting from the raw rows in `df`, geom_bar() does the counting itself.
# The labels read the same computed count through after_stat(), which
# replaces the deprecated `..count..` notation.
ggplot(df, aes(x = Date)) +
  geom_bar() +
  theme_classic() +
  ggtitle("Date") +
  xlab("Date") +
  ylab("Frequency") +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1)
So the code below is working w/out errors, and I am trying to fix the following issue.
First, I am trying to change the group name shown on each graph to say, for instance, "< 1500 dollars" for the group of workers earning $1500 or less, and so on.
I tried the solution of changing the underlying factor level names, but I keep getting this error:
"Error: unexpected ',' in ""< 1500 Dollars",""
# Work on a copy so that Wage_Outflows keeps its raw group labels.
outflows <- Wage_Outflows

# Inspect the current labels. The console printed:
#   "< 1500", "1501 ~ 2999", "3000"
# That output must stay a comment: a bare comma-separated list of strings
# is a syntax error ("unexpected ','") when it is run as code.
levels(outflows$wage_group)

# Relabel the wage groups. factor() with explicit levels/labels works
# whether wage_group is stored as a factor or as plain text.
outflows$wage_group <- factor(
  outflows$wage_group,
  levels = c("< 1500", "1501 ~ 2999", "3000"),
  labels = c("< 1500 Dollars", "1501 ~ 2999 Dollars", "3000 Dollars")
)
text.on.each.panel <- "Dollars"

# One plot per wage group, each with its own y-axis breaks. The plots read
# from the relabelled copy, so the facet strips show the new group names.
p1 <- ggplot(
  outflows[outflows$wage_group == "< 1500 Dollars", ],
  aes(x = year, y = labor)
) +
  geom_point() +
  scale_y_continuous(breaks = seq(4000000, 6500000, by = 400000)) +
  facet_wrap(~ wage_group) +
  theme(axis.title.x = element_blank())

p2 <- ggplot(
  outflows[outflows$wage_group == "1501 ~ 2999 Dollars", ],
  aes(x = year, y = labor)
) +
  geom_point() +
  scale_y_continuous(breaks = seq(800000, 1100000, by = 20000)) +
  facet_wrap(~ wage_group) +
  theme(axis.title.x = element_blank())

p3 <- ggplot(
  outflows[outflows$wage_group == "3000 Dollars", ],
  aes(x = year, y = labor)
) +
  geom_point() +
  scale_y_continuous(breaks = seq(50000, 120000, by = 5000)) +
  facet_wrap(~ wage_group) +
  theme(axis.title.x = element_blank())

# Stack the three plots in a single column.
grid.arrange(p1, p2, p3, ncol = 1)
For your first question have a look at the labeller argument in the facet_wrap function.
And for your second question the labs function might be the solution.
# Plot for the lowest wage group. The facet strip text is replaced through
# `labeller`, and the y-axis title through labs().
p1 <- ggplot(
  Wage_Outflows[Wage_Outflows$wage_group == "< 1500", ],
  aes(x = year, y = labor)
) +
  geom_point() +
  scale_y_continuous(breaks = seq(4000000, 6500000, by = 400000)) +
  labs(y = "Number of workers") +
  # Lookup vector: original level -> text shown in the strip. The label is
  # kept on one line; a line break inside the quotes would end up in the
  # strip text.
  facet_wrap(
    ~ wage_group,
    labeller = labeller(wage_group = c(`< 1500` = "< 1500 dollars"))
  ) +
  theme(axis.title.x = element_blank())
Maybe you can shorten your code like this:
# Example dataset: three wage groups (A, B, C) with ten yearly
# observations each, 30 rows in total.
df <- data.frame(
  wage_group = rep(c("A", "B", "C"), each = 10),
  year = rep(2001:2010, times = 3),
  labor = seq(from = 5000, to = 34000, by = 1000)
)
# One panel per wage group, stacked in a single column with free scales.
# The strip text of each panel comes from the lookup vector passed to
# labeller().
ggplot(data = df, mapping = aes(x = factor(year), y = labor)) +
  geom_point() +
  facet_wrap(
    ~wage_group,
    ncol = 1,
    scales = "free",
    labeller = labeller(
      wage_group = c(
        `A` = "less than 1500 dollars",
        `B` = "1500-2999 dollars",
        `C` = "more than 3000 dollars"
      )
    )
  ) +
  labs(y = "# of workers") +
  theme(axis.title.x = element_blank())
# Scatter of nq against area on log10 axes, one panel per value of
# `levels`. The shifted regression line of each panel is read from
# `levelnew`.
ggplot(data = all, mapping = aes(x = area, y = nq)) +
  geom_point(size = 0.5) +
  geom_abline(
    data = levelnew,
    mapping = aes(intercept = log10(exp(interceptmax)), slope = fslope)
  ) +
  # Axis labels are printed in full rather than in scientific notation.
  scale_x_log10(labels = function(x) format(x, scientific = FALSE)) +
  scale_y_log10(labels = function(y) format(y, scientific = FALSE)) +
  facet_wrap(~levels) +
  theme_bw() +
  theme(panel.grid.major = element_line(colour = "#808080"))
And I get this figure
Now I want to add one geom_line to one of the facets. Basically, I wanted to have a dotted line (Say x=10,000) in only the major panel. How can I do this?
I don't have your data, so I made some up:
# Made-up data: 100 random points split evenly over four groups a-d.
df <- data.frame(
  x = rnorm(100),
  y = rnorm(100),
  z = rep(letters[1:4], each = 25)
)
# Baseline plot: one panel per group.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point() +
  facet_wrap(~z) +
  theme_bw()
To add a vertical line at x = 1 we can use geom_vline() with a dataframe that has the same faceting variable (in my case z='b', but yours will be levels='major'):
# The line's data frame carries the faceting variable (z = "b"), so the
# dotted vertical line at x = 1 is drawn in that panel only.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point() +
  geom_vline(
    data = data.frame(xint = 1, z = "b"),
    mapping = aes(xintercept = xint),
    linetype = "dotted"
  ) +
  facet_wrap(~z) +
  theme_bw()
Another way to express this which is possibly easier to generalize (and formatting stuff left out):
# Same idea, but the layer's data is a subset of the original data frame,
# which restricts the vertical line to the panel for z == "b".
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point() +
  facet_wrap(~z) +
  geom_vline(
    data = subset(df, z == "b"),
    mapping = aes(xintercept = 1)
  )
The key things being: facet first, then decorate facets by subsetting the original data frame, and put the details in a new aes if possible. Other examples of a similar idea:
# Different decoration per panel, each layer fed with its own subset:
#   z == "b": vertical line at x = 1
#   z == "c": linear fit without a confidence band
#   z == "d": text label at (-2, 0)
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point() +
  facet_wrap(~z) +
  geom_vline(
    data = subset(df, z == "b"),
    mapping = aes(xintercept = 1)
  ) +
  geom_smooth(
    data = subset(df, z == "c"),
    mapping = aes(x, y),
    method = lm,
    se = FALSE
  ) +
  geom_text(
    data = subset(df, z == "d"),
    mapping = aes(x = -2, y = 0, label = "Foobar")
  )
I am trying to plot different types of plots (line plots and bar charts) beneath one another; they all share the same x-axis:
# Three plots over the same date axis, arranged beneath one another.
# The x-axis title is hidden with theme(axis.title.x = element_blank());
# the older opts() / theme_blank() calls are no longer part of ggplot2.

# Smoothed trend plus points: cumulative number of new mutations.
c1 <- ggplot(data, aes(date, TotalMutObs)) +
  stat_smooth(se = FALSE) +
  geom_point() +
  theme(axis.title.x = element_blank()) +
  ylab("Cumulative number of new mutations")

# Smoothed trend plus points: cumulative mean pairwise distance.
c2 <- ggplot(data, aes(date, distance)) +
  stat_smooth(se = FALSE) +
  geom_point() +
  theme(axis.title.x = element_blank()) +
  ylab("Cumulative mean pairwise distance")

# Bars: number of horses sampled per date.
c3 <- ggplot(data, aes(x = date, y = NbOfHorses)) +
  geom_bar(stat = "identity") +
  theme(axis.title.x = element_blank()) +
  ylab("Number of horses sampled")

grid.arrange(c1, c2, c3)
However, the dates on the x-axis are not lining up for the different plots.
Here is some data to try it out:
# Example data: 21 sampling dates with three measurements each.
# All values are entered as strings, and cbind() combines them into a
# character matrix before the conversion to a data frame.
date <- c(
  "2003-03-13", "2003-03-25", "2003-03-26", "2003-03-27", "2003-03-28",
  "2003-03-31", "2003-04-01", "2003-04-02", "2003-04-04", "2003-04-08",
  "2003-04-09", "2003-04-10", "2003-04-11", "2003-04-14", "2003-04-15",
  "2003-04-17", "2003-04-19", "2003-04-21", "2003-04-22", "2003-04-28",
  "2003-05-08"
)
NbOfHorses <- c(
  "1", "2", "1", "3", "4", "5", "4", "3", "3", "3", "3",
  "4", "2", "4", "1", "2", "4", "1", "2", "1", "2"
)
TotalMutObs <- c(
  "20", "30", "58", "72", "140", "165", "204", "230", "250", "286", "302",
  "327", "346", "388", "393", "414", "443", "444", "462", "467", "485"
)
distance <- c(
  "0.000693202", "0.00073544", "0.000855432", "0.000506876", "0.000720193",
  "0.000708047", "0.000835468", "0.000812401", "0.000803149", "0.000839117",
  "0.000842048", "0.000856393", "0.000879973", "0.000962382", "0.000990666",
  "0.001104861", "0.001137515", "0.001143838", "0.00121874", "0.001213737",
  "0.001201379"
)
data <- as.data.frame(cbind(date, NbOfHorses, TotalMutObs, distance))
Cheers,
Joseph
The way to solve this problem is to work within ggplot2 and get creative about stacking copies of your data and then sending subsets to each geom that you need.
# Cleaned-up version of the data: a real Date column and numeric measures.
dat <- data.frame(
  date = as.Date(date),
  NbOfHorses = as.numeric(NbOfHorses),
  TotalMutObs = as.numeric(TotalMutObs),
  distance = as.numeric(distance)
)
# Stack three copies of the data, one per panel, and tag each copy with the
# title its panel should carry.
fullDat <- rbind(dat, dat, dat)
fullDat[["grp"]] <- rep(
  c(
    "Cumulative number of new mutations",
    "Cumulative mean pairwise distance",
    "Number of horses sampled"
  ),
  each = nrow(dat)
)
# One faceted plot instead of three separate ones, so all panels share the
# same date axis. Each layer receives only the copy of the data that
# belongs to its panel.
mutations <- subset(fullDat, grp == "Cumulative number of new mutations")
distances <- subset(fullDat, grp == "Cumulative mean pairwise distance")
horses <- subset(fullDat, grp == "Number of horses sampled")

ggplot(fullDat, aes(x = date)) +
  # `scales` spelled out in full; free y scales let each panel keep its
  # own range.
  facet_wrap(~grp, nrow = 3, scales = "free_y") +
  geom_point(data = mutations, aes(y = TotalMutObs)) +
  stat_smooth(data = mutations, aes(y = TotalMutObs), se = FALSE) +
  geom_point(data = distances, aes(y = distance)) +
  stat_smooth(data = distances, aes(y = distance), se = FALSE) +
  # geom_col() draws bars whose heights are the values themselves.
  geom_col(data = horses, aes(y = NbOfHorses)) +
  # The panel titles describe the measures, so no axis titles are needed.
  labs(x = NULL, y = NULL)