Correlation Coefficients between population and suicide in Rstudio - r

Percentage Change
selectedCountry <- dataset %>%
filter(country == 'Japan') %>%
select(population, suicides_no, year) %>%
group_by(year) %>%
summarise(s_count = sum(suicides_no), p_count = sum(population))
year s_count p_count
<dbl> <dbl> <dbl>
1 1979 20711 107268500
2 1980 20416 108473500
3 1981 19976 109674700
4 1982 20535 110722900
5 1983 24853 111070000
6 1984 24221 111950000
I want to find the correlation between population and suicide after I aggregate population and suicide like this
percentage <- selectedCountry %>%
arrange(year) %>%
mutate(pct.chg.s = 100 * (s_count - lag(s_count,default=first(s_count))) / lag(s_count,default=first(s_count))) %>%
mutate(pct.chg.p = 100 * (p_count - lag(p_count,default=first(p_count))) / lag(p_count,default=first(p_count))) %>%
mutate(correlation = cor(pct.chg.s, pct.chg.p))
head(percentage)
I end up with a result
year s_count p_count pct.chg.s pct.chg.p correlation
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1979 20711 107268500 0 0 0.00789
2 1980 20416 108473500 -1.42 1.12 0.00789
3 1981 19976 109674700 -2.16 1.11 0.00789
4 1982 20535 110722900 2.80 0.956 0.00789
5 1983 24853 111070000 21.0 0.313 0.00789
6 1984 24221 111950000 -2.54 0.792 0.00789
to plot the corr between two variables using two way (I can't sure if this true or false)
ggscatter(percentage, x = "pct.chg.s", y = "pct.chg.p",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Percentage Change in Suicide",
ylab = "Percentage Change in Population",
color = "blue", shape = 19,
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
ellipse = TRUE, mean.point = TRUE,
star.plot = TRUE)
ggplot(percentage, aes(x = pct.chg.p, y = pct.chg.s)) +
geom_point() +
geom_smooth(method = "lm",formula = y ~ x) +
labs(title = "Correlation between Pubulation and Suicides",
x = "Percentage Change in Suicide",
y = "Percentage Change in Population")

Related

Predicted values from a log model using add_predictions() are a long way off

I've fit a log model to some data, which works nicely:
model_data <- structure(list(standard_conc_ngul = c(50, 50, 50, 5, 5, 0.5,
0.5, 0.05, 0.05, 0.005, 0.005, 0.005), ct = c(18.3305377960205,
18.133768081665, 17.8813705444336, 21.5002365112305, 21.4915542602539,
22.7616996765137, 23.6836719512939, 25.3699340820312, 25.3488445281982,
28.984302520752, 26.7397594451904, 27.8844776153564)), row.names = c(NA,
-12L), class = c("tbl_df", "tbl", "data.frame"))
> print(model_data)
# A tibble: 12 × 2
standard_conc_ngul ct
<dbl> <dbl>
1 50 18.3
2 50 18.1
3 50 17.9
4 5 21.5
5 5 21.5
6 0.5 22.8
7 0.5 23.7
8 0.05 25.4
9 0.05 25.3
10 0.005 29.0
11 0.005 26.7
12 0.005 27.9
model <- lm(log(standard_conc_ngul) ~ ct, data = model_data)
> summary(model)$r.squared
[1] 0.9727793
ggplot(model_data, aes(x = standard_conc_ngul, y = ct)) +
geom_point() +
stat_smooth(method = lm, formula = y ~ log(x)) +
stat_poly_eq(formula = y ~ log(x),
aes(label = paste(after_stat(eq.label), after_stat(rr.label), sep = "~~~")),
parse = TRUE, coef.digits = 3, f.digits = 3, p.digits = 3,
rr.digits = 3)
I'd now like to use this model to predict standard_conc_ngul from ct in new data. Before I do that I wanted to test the predictions in the same model data above. However, when I do it they're nowhere near:
library(modelr)
model_data %>%
add_predictions(model, var = "standard_conc_ngul_predicted") %>%
ggplot(aes(x = standard_conc_ngul, y = standard_conc_ngul_predicted)) +
geom_point()
Any idea what's going on??
The predictions are calculating the natural log of your outcome to match the equation. Correcting them using exp gives comparable values:
library(tidyverse)
model_data %>%
modelr::add_predictions(model, var = "standard_conc_ngul_predicted") |>
mutate(pred = exp(standard_conc_ngul_predicted)) |>
ggplot(aes(x = standard_conc_ngul, y = pred)) +
geom_point()
Is this what you're looking for?

Calculation of "average sales share " with dplyr::mutate

My data concerns a company and includes Total Sales and the amount of sales in three counties CA , TX and WI.
Data :
> dput(head(WalData))
structure(list(CA = c(11047, 9925, 11322, 12251, 16610, 14696
), TX = c(7381, 5912, 9006, 6226, 9440, 9376), WI = c(6984, 3309,
8883, 9533, 11882, 8664), Total = c(25412, 19146, 29211, 28010,
37932, 32736), date = structure(c(1296518400, 1296604800, 1296691200,
1296777600, 1296864000, 1296950400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), event_type = c("NA", "NA", "NA", "NA", "NA", "Sporting"
), snap_CA = c(1, 1, 1, 1, 1, 1), snap_TX = c(1, 0, 1, 0, 1,
1), snap_WI = c(0, 1, 1, 0, 1, 1)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
With the following code i am trying to calculate the average sales share of the three states on the company's total sales.
In addition, i need the same average percentages for each year, month of the year and day of the week.
install.packages("dplyr")
install.packages("lubridate")
library(dplyr)
library(lubridate)
df1 <- df %>%
dplyr::mutate(YEAR = lubridate::year(date),
MONTH = lubridate::month(date),
WEEKDAY = lubridate::wday(date),
P_CA = CA / Total,
P_TX = TX / Total,
P_WI = WI / Total)
# Average per Year
df1 %>%
dplyr::group_by(YEAR) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
# Average per Month
df1 %>%
dplyr::group_by(MONTH) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
# Average per Weekday
df1 %>%
dplyr::group_by(WEEKDAY) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
Output :
> df1 <- df %>%
+ dplyr::mutate(YEAR = lubridate::year(date),
+ MONTH = lubridate::month(date),
+ WEEKDAY = lubridate::wday(date),
+ P_CA = CA / Total,
+ P_TX = TX / Total,
+ P_WI = WI / Total)
Error in UseMethod("mutate_") :
no applicable method for 'mutate_' applied to an object of class "function"
> # Average per Year
> df1 %>%
+ dplyr::group_by(YEAR) %>%
+ dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
+ AV_TX = mean(P_TX, na.rm = TRUE),
+ AV_WI = mean(P_WI, na.rm = TRUE))
Error in eval(lhs, parent, parent) : object 'df1' not found
It comes with an error : Error in UseMethod("mutate_") :
no applicable method for 'mutate_' applied to an object of class "function"
I cant figure out whats wrong , i double checked the code and the correctness of the data .
Please give a solution .
The issue would be that df is not created as an object in the global env and there is a function with name df if we do ?df
df(x, df1, df2, ncp, log = FALSE)
Basically, the error is based on applying mutate on a function df rather than an object
Checking on a fresh R session with no objects created
df %>%
dplyr::mutate(YEAR = lubridate::year(date),
MONTH = lubridate::month(date),
WEEKDAY = lubridate::wday(date),
P_CA = CA / Total,
P_TX = TX / Total,
P_WI = WI / Total)
Error in UseMethod("mutate_") :
no applicable method for 'mutate_' applied to an object of class "function"
Now, we define 'df' as
df <- WalData
df %>%
dplyr::mutate(YEAR = lubridate::year(date),
MONTH = lubridate::month(date),
WEEKDAY = lubridate::wday(date),
P_CA = CA / Total,
P_TX = TX / Total,
P_WI = WI / Total)
# A tibble: 6 x 15
# CA TX WI Total date event_type snap_CA snap_TX snap_WI YEAR MONTH WEEKDAY P_CA P_TX P_WI
# <dbl> <dbl> <dbl> <dbl> <dttm> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 11047 7381 6984 25412 2011-02-01 00:00:00 NA 1 1 0 2011 2 3 0.435 0.290 0.275
#2 9925 5912 3309 19146 2011-02-02 00:00:00 NA 1 0 1 2011 2 4 0.518 0.309 0.173
#3 11322 9006 8883 29211 2011-02-03 00:00:00 NA 1 1 1 2011 2 5 0.388 0.308 0.304
#4 12251 6226 9533 28010 2011-02-04 00:00:00 NA 1 0 0 2011 2 6 0.437 0.222 0.340
#5 16610 9440 11882 37932 2011-02-05 00:00:00 NA 1 1 1 2011 2 7 0.438 0.249 0.313
#6 14696 9376 8664 32736 2011-02-06 00:00:00 Sporting 1 1 1 2011 2 1 0.449 0.286 0.265

Group data into multiple season and boxplot side by side using ggplot in R?

I would like to group data into multiple seasin such that my season are winter: Dec - Feb; Spring: Mar - May; Summer: Jun -Aug, and Fall: Sep - Nov. I would then like to boxplot the Winter and Spring seasonal data comparing A to B and then A to C. Here is my laborious code so far. I would appreciate an efficient way of data grouping and plotting.
library(tidyverse)
library(reshape2)
Dates30s = data.frame(seq(as.Date("2011-01-01"), to= as.Date("2040-12-31"),by="day"))
colnames(Dates30s) = "date"
FakeData = data.frame(A = runif(10958, min = 0.5, max = 1.5), B = runif(10958, min = 1.6, max = 2), C = runif(10958, min = 0.8, max = 1.8))
myData = data.frame(Dates30s, FakeData)
myData = separate(myData, date, sep = "-", into = c("Year", "Month", "Day"))
myData$Year = as.numeric(myData$Year)
myData$Month = as.numeric(myData$Month)
SeasonalData = myData %>% group_by(Year, Month) %>% summarise_all(funs(mean)) %>% select(Year, Month, A, B, C)
Spring = SeasonalData %>% filter(Month == 3 | Month == 4 |Month == 5)
Winter1 = SeasonalData %>% filter(Month == 12)
Winter1$Year = Winter1$Year+1
Winter2 = SeasonalData %>% filter(Month == 1 | Month == 2 )
Winter = rbind(Winter1, Winter2) %>% filter(Year >= 2012 & Year <= 2040) %>% group_by(Year) %>% summarise_all(funs(mean)) %>% select(-"Month")
BoxData = gather(Winter, key = "Variable", value = "value", -Year )
ggplot(BoxData, aes(x=Variable, y=value,fill=factor(Variable)))+
geom_boxplot() + labs(title="Winter") +facet_wrap(~Variable)
I would like to have Two figures: Figure 1 split in two; one for Winter season and one for Summer season (see BoxPlot 1) and one for Monthly annual average representing average monthly values across the entire time period (2011 -2040) see Boxplot 2
This is what I usually do it. All calculation and plotting are based on water year (WY) or hydrologic year from October to September.
library(tidyverse)
library(lubridate)
set.seed(123)
Dates30s <- data.frame(seq(as.Date("2011-01-01"), to = as.Date("2040-12-31"), by = "day"))
colnames(Dates30s) <- "date"
FakeData <- data.frame(A = runif(10958, min = 0.3, max = 1.5),
B = runif(10958, min = 1.2, max = 2),
C = runif(10958, min = 0.6, max = 1.8))
### Calculate Year, Month then Water year (WY) and Season
myData <- data.frame(Dates30s, FakeData) %>%
mutate(Year = year(date),
MonthNr = month(date),
Month = month(date, label = TRUE, abbr = TRUE)) %>%
mutate(WY = case_when(MonthNr > 9 ~ Year + 1,
TRUE ~ Year)) %>%
mutate(Season = case_when(MonthNr %in% 9:11 ~ "Fall",
MonthNr %in% c(12, 1, 2) ~ "Winter",
MonthNr %in% 3:5 ~ "Spring",
TRUE ~ "Summer")) %>%
select(-date, -MonthNr, -Year) %>%
as_tibble()
myData
#> # A tibble: 10,958 x 6
#> A B C Month WY Season
#> <dbl> <dbl> <dbl> <ord> <dbl> <chr>
#> 1 0.645 1.37 1.51 Jan 2011 Winter
#> 2 1.25 1.79 1.71 Jan 2011 Winter
#> 3 0.791 1.35 1.68 Jan 2011 Winter
#> 4 1.36 1.97 0.646 Jan 2011 Winter
#> 5 1.43 1.31 1.60 Jan 2011 Winter
#> 6 0.355 1.52 0.708 Jan 2011 Winter
#> 7 0.934 1.94 0.825 Jan 2011 Winter
#> 8 1.37 1.89 1.03 Jan 2011 Winter
#> 9 0.962 1.75 0.632 Jan 2011 Winter
#> 10 0.848 1.94 0.883 Jan 2011 Winter
#> # ... with 10,948 more rows
Calculate seasonal and monthly average by WY
### Seasonal Avg by WY
SeasonalAvg <- myData %>%
select(-Month) %>%
group_by(WY, Season) %>%
summarise_all(mean, na.rm = TRUE) %>%
ungroup() %>%
gather(key = "State", value = "MFI", -WY, -Season)
SeasonalAvg
#> # A tibble: 366 x 4
#> WY Season State MFI
#> <dbl> <chr> <chr> <dbl>
#> 1 2011 Fall A 0.939
#> 2 2011 Spring A 0.907
#> 3 2011 Summer A 0.896
#> 4 2011 Winter A 0.909
#> 5 2012 Fall A 0.895
#> 6 2012 Spring A 0.865
#> 7 2012 Summer A 0.933
#> 8 2012 Winter A 0.895
#> 9 2013 Fall A 0.879
#> 10 2013 Spring A 0.872
#> # ... with 356 more rows
### Monthly Avg by WY
MonthlyAvg <- myData %>%
select(-Season) %>%
group_by(WY, Month) %>%
summarise_all(mean, na.rm = TRUE) %>%
ungroup() %>%
gather(key = "State", value = "MFI", -WY, -Month) %>%
mutate(Month = factor(Month))
MonthlyAvg
#> # A tibble: 1,080 x 4
#> WY Month State MFI
#> <dbl> <ord> <chr> <dbl>
#> 1 2011 Jan A 1.00
#> 2 2011 Feb A 0.807
#> 3 2011 Mar A 0.910
#> 4 2011 Apr A 0.923
#> 5 2011 May A 0.888
#> 6 2011 Jun A 0.876
#> 7 2011 Jul A 0.909
#> 8 2011 Aug A 0.903
#> 9 2011 Sep A 0.939
#> 10 2012 Jan A 0.903
#> # ... with 1,070 more rows
Plot seasonal and monthly data
### Seasonal plot
s1 <- ggplot(SeasonalAvg, aes(x = Season, y = MFI, color = State)) +
geom_boxplot(position = position_dodge(width = 0.7)) +
geom_point(position = position_jitterdodge(seed = 123))
s1
### Monthly plot
m1 <- ggplot(MonthlyAvg, aes(x = Month, y = MFI, color = State)) +
geom_boxplot(position = position_dodge(width = 0.7)) +
geom_point(position = position_jitterdodge(seed = 123))
m1
Bonus
### https://stackoverflow.com/a/58369424/786542
# if (!require(devtools)) {
# install.packages('devtools')
# }
# devtools::install_github('erocoar/gghalves')
library(gghalves)
s2 <- ggplot(SeasonalAvg, aes(x = Season, y = MFI, color = State)) +
geom_half_boxplot(nudge = 0.05) +
geom_half_violin(aes(fill = State),
side = "r", nudge = 0.01) +
theme_light() +
theme(legend.position = "bottom") +
guides(fill = guide_legend(nrow = 1))
s2
s3 <- ggplot(SeasonalAvg, aes(x = Season, y = MFI, color = State)) +
geom_half_boxplot(nudge = 0.05, outlier.color = NA) +
geom_dotplot(aes(fill = State),
binaxis = "y", method = "histodot",
dotsize = 0.35,
stackdir = "up", position = PositionDodge) +
theme_light() +
theme(legend.position = "bottom") +
guides(color = guide_legend(nrow = 1))
s3
#> `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
Created on 2019-10-16 by the reprex package (v0.3.0)

Stacked bar chart with percentage labels

I created a stacked bar chart where the bars represent a percentage of the population. I would like to add labels to the 65+ category (or for all 3 categories if it is not possible to do it just for 1 category) showing the % value for each year. If I add geom_text(label = datm$value), the bars become extremely small because the labels represent absolute values instead of percentages. This is my code:
dat <- read.table(text = "2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018
0-20 24.0 23.9 23.7 23.5 23.3 23.1 22.9 22.7 22.5 22.3 22.2
20-65 61.3 61.2 61.0 60.9 60.5 60.1 59.8 59.6 59.3 59.1 59.0
65+ 14.8 15.0 15.3 15.6 16.2 16.8 17.4 17.7 18.2 18.5 18.8", sep = " ", header = TRUE)
library(reshape)
datm <- melt(cbind(dat, ind = rownames(dat)), id.vars = c('ind'))
library(scales)
library(ggplot2)
ggplot(datm,aes(x = variable, y = value, fill = ind)) +
geom_bar(position = "fill",stat = "identity") +
scale_x_discrete(labels = c('2008', '2009', '2010', '2011', '2012', '2013',
'2014', '2015', '2016', '2017', '2018')) +
scale_y_continuous(labels = percent_format()) +
xlab('Year') +
ylab('% of population') +
ggtitle('Demographic trend in the Netherlands') +
scale_fill_manual(values = c("green", "blue", "darkgray"))
You can try this. Explanations in comments below:
library(dplyr)
# calculate percentage within each year
datm2 <- datm %>%
group_by(variable) %>%
mutate(p = value / sum(value)) %>%
ungroup()
> head(datm2)
# A tibble: 6 x 4
ind variable value p
<fct> <fct> <dbl> <dbl>
1 0-20 X2008 24 0.240
2 20-65 X2008 61.3 0.612
3 65+ X2008 14.8 0.148
4 0-20 X2009 23.9 0.239
5 20-65 X2009 61.2 0.611
6 65+ X2009 15 0.150
ggplot(datm2, aes(x = variable, y = value, fill = ind)) +
geom_col(position = "fill") + # geom_col is equivalent to geom_bar(stat = "identity")
geom_text(aes(label = scales::percent(p), # add layer for percentage values
alpha = ifelse(ind == "65+", 1, 0)), # only visible for 65+ category
position = position_fill(vjust = 0.5)) + # follow barplot's position
scale_x_discrete(labels = c('2008', '2009', '2010', '2011', '2012', '2013',
'2014', '2015', '2016', '2017', '2018')) +
scale_y_continuous(labels = percent_format()) +
scale_alpha_identity() +
xlab('Year') +
ylab('% of population') +
ggtitle('Demographic trend in the Netherlands') +
scale_fill_manual(values = c("green", "blue", "darkgray"))

How to change the linetype of one of two lines

I would like to change the linetype of one of my two lines in the plot, only making "line1" into a dashed one.
My plot:
My data looks like as bellow:
Year Sex value Rate Group
<dbl> <chr> <dbl> <dbl> <chr>
1 1912 Female 18 1.14 A
2 1912 Male 52 0.893 L
3 1913 Female 25 1.02 A
4 1913 Male 42 1.05 L
5 1914 Female 14 1.26 A
6 1914 Male 67 1.29 L
7 1915 Female 25 1.32 A
8 1915 Male 61 1.45 L
9 1916 Female 32 1.52 A
10 1916 Male 71 1.64 L
11 1917 Female 42 2.01 A
12 1917 Male 92 1.87 L
My code:
data %>% ggplot() +
geom_bar(aes(x = Year, y = value, fill= Sex), stat = "identity",
width=0.8,
alpha=0.8) +
geom_line(aes(x = Year, y = Rate * 100, colour= Group),
size = 1.0) +
scale_colour_manual(labels = c("line1","line2"),
values = c("red","blue"))+
scale_fill_manual(values = c("Female"="green","Male"="black"))+
guides(fill=guide_legend(title = "Number"),
color=guide_legend(title= "Ratio"))
Could anyone help? I tried for quite a while but failed. Thanks in advance.
As suggested in comments, and using the "name" term in each scale_*_manual to specify the legend names:
data %>%
ggplot() +
geom_bar(aes(x = Year, y = value, fill= Sex), stat = "identity",
width=0.8,
alpha=0.8) +
geom_line(aes(x = Year, y = Rate * 100,
colour = Group, linetype = Group),
size = 1.0) +
scale_colour_manual(name = "Ratio",
labels = c("line1","line2"),
values = c("red","blue"))+
scale_linetype_manual(name = "Ratio",
labels = c("line1","line2"),
values = c("dashed","solid"))+
scale_fill_manual(name = "Number",
values = c("Female"="green","Male"="black"))

Resources