Correlation Coefficients between population and suicide in Rstudio

Correlation Coefficients between population and suicide in Rstudio - r

Percentage Change
selectedCountry <- dataset %>%
filter(country == 'Japan') %>%
select(population, suicides_no, year) %>%
group_by(year) %>%
summarise(s_count = sum(suicides_no), p_count = sum(population))
year s_count p_count
<dbl> <dbl> <dbl>
1 1979 20711 107268500
2 1980 20416 108473500
3 1981 19976 109674700
4 1982 20535 110722900
5 1983 24853 111070000
6 1984 24221 111950000
I want to find the correlation between population and suicide after I aggregate population and suicide like this
percentage <- selectedCountry %>%
arrange(year) %>%
mutate(pct.chg.s = 100 * (s_count - lag(s_count,default=first(s_count))) / lag(s_count,default=first(s_count))) %>%
mutate(pct.chg.p = 100 * (p_count - lag(p_count,default=first(p_count))) / lag(p_count,default=first(p_count))) %>%
mutate(correlation = cor(pct.chg.s, pct.chg.p))
head(percentage)
I end up with a result
year s_count p_count pct.chg.s pct.chg.p correlation
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1979 20711 107268500 0 0 0.00789
2 1980 20416 108473500 -1.42 1.12 0.00789
3 1981 19976 109674700 -2.16 1.11 0.00789
4 1982 20535 110722900 2.80 0.956 0.00789
5 1983 24853 111070000 21.0 0.313 0.00789
6 1984 24221 111950000 -2.54 0.792 0.00789
to plot the corr between two variables using two way (I can't sure if this true or false)
ggscatter(percentage, x = "pct.chg.s", y = "pct.chg.p",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Percentage Change in Suicide",
ylab = "Percentage Change in Population",
color = "blue", shape = 19,
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
ellipse = TRUE, mean.point = TRUE,
star.plot = TRUE)
ggplot(percentage, aes(x = pct.chg.p, y = pct.chg.s)) +
geom_point() +
geom_smooth(method = "lm",formula = y ~ x) +
labs(title = "Correlation between Pubulation and Suicides",
x = "Percentage Change in Suicide",
y = "Percentage Change in Population")

Related

Predicted values from a log model using add_predictions() are a long way off

I've fit a log model to some data, which works nicely:
model_data <- structure(list(standard_conc_ngul = c(50, 50, 50, 5, 5, 0.5,
0.5, 0.05, 0.05, 0.005, 0.005, 0.005), ct = c(18.3305377960205,
18.133768081665, 17.8813705444336, 21.5002365112305, 21.4915542602539,
22.7616996765137, 23.6836719512939, 25.3699340820312, 25.3488445281982,
28.984302520752, 26.7397594451904, 27.8844776153564)), row.names = c(NA,
-12L), class = c("tbl_df", "tbl", "data.frame"))
> print(model_data)
# A tibble: 12 × 2
standard_conc_ngul ct
<dbl> <dbl>
1 50 18.3
2 50 18.1
3 50 17.9
4 5 21.5
5 5 21.5
6 0.5 22.8
7 0.5 23.7
8 0.05 25.4
9 0.05 25.3
10 0.005 29.0
11 0.005 26.7
12 0.005 27.9
model <- lm(log(standard_conc_ngul) ~ ct, data = model_data)
> summary(model)$r.squared
[1] 0.9727793
ggplot(model_data, aes(x = standard_conc_ngul, y = ct)) +
geom_point() +
stat_smooth(method = lm, formula = y ~ log(x)) +
stat_poly_eq(formula = y ~ log(x),
aes(label = paste(after_stat(eq.label), after_stat(rr.label), sep = "~~~")),
parse = TRUE, coef.digits = 3, f.digits = 3, p.digits = 3,
rr.digits = 3)
I'd now like to use this model to predict standard_conc_ngul from ct in new data. Before I do that I wanted to test the predictions in the same model data above. However, when I do it they're nowhere near:
library(modelr)
model_data %>%
add_predictions(model, var = "standard_conc_ngul_predicted") %>%
ggplot(aes(x = standard_conc_ngul, y = standard_conc_ngul_predicted)) +
geom_point()
Any idea what's going on??

The predictions are calculating the natural log of your outcome to match the equation. Correcting them using exp gives comparable values:
library(tidyverse)
model_data %>%
modelr::add_predictions(model, var = "standard_conc_ngul_predicted") |>
mutate(pred = exp(standard_conc_ngul_predicted)) |>
ggplot(aes(x = standard_conc_ngul, y = pred)) +
geom_point()
Is this what you're looking for?

Calculation of "average sales share " with dplyr::mutate

My data concerns a company and includes Total Sales and the amount of sales in three counties CA , TX and WI.
Data :
> dput(head(WalData))
structure(list(CA = c(11047, 9925, 11322, 12251, 16610, 14696
), TX = c(7381, 5912, 9006, 6226, 9440, 9376), WI = c(6984, 3309,
8883, 9533, 11882, 8664), Total = c(25412, 19146, 29211, 28010,
37932, 32736), date = structure(c(1296518400, 1296604800, 1296691200,
1296777600, 1296864000, 1296950400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), event_type = c("NA", "NA", "NA", "NA", "NA", "Sporting"
), snap_CA = c(1, 1, 1, 1, 1, 1), snap_TX = c(1, 0, 1, 0, 1,
1), snap_WI = c(0, 1, 1, 0, 1, 1)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
With the following code i am trying to calculate the average sales share of the three states on the company's total sales.
In addition, i need the same average percentages for each year, month of the year and day of the week.
install.packages("dplyr")
install.packages("lubridate")
library(dplyr)
library(lubridate)
df1 <- df %>%
dplyr::mutate(YEAR = lubridate::year(date),
MONTH = lubridate::month(date),
WEEKDAY = lubridate::wday(date),
P_CA = CA / Total,
P_TX = TX / Total,
P_WI = WI / Total)
# Average per Year
df1 %>%
dplyr::group_by(YEAR) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
# Average per Month
df1 %>%
dplyr::group_by(MONTH) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
# Average per Weekday
df1 %>%
dplyr::group_by(WEEKDAY) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
Output :
> df1 <- df %>%
+ dplyr::mutate(YEAR = lubridate::year(date),
+ MONTH = lubridate::month(date),
+ WEEKDAY = lubridate::wday(date),
+ P_CA = CA / Total,
+ P_TX = TX / Total,
+ P_WI = WI / Total)
Error in UseMethod("mutate_") :
no applicable method for 'mutate_' applied to an object of class "function"
> # Average per Year
> df1 %>%
+ dplyr::group_by(YEAR) %>%
+ dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
+ AV_TX = mean(P_TX, na.rm = TRUE),
+ AV_WI = mean(P_WI, na.rm = TRUE))
Error in eval(lhs, parent, parent) : object 'df1' not found
It comes with an error : Error in UseMethod("mutate_") :
no applicable method for 'mutate_' applied to an object of class "function"
I cant figure out whats wrong , i double checked the code and the correctness of the data .
Please give a solution .

The issue would be that df is not created as an object in the global env and there is a function with name df if we do ?df
df(x, df1, df2, ncp, log = FALSE)
Basically, the error is based on applying mutate on a function df rather than an object
Checking on a fresh R session with no objects created
df %>%
dplyr::mutate(YEAR = lubridate::year(date),
MONTH = lubridate::month(date),
WEEKDAY = lubridate::wday(date),
P_CA = CA / Total,
P_TX = TX / Total,
P_WI = WI / Total)
Error in UseMethod("mutate_") :
no applicable method for 'mutate_' applied to an object of class "function"
Now, we define 'df' as
df <- WalData
df %>%
dplyr::mutate(YEAR = lubridate::year(date),
MONTH = lubridate::month(date),
WEEKDAY = lubridate::wday(date),
P_CA = CA / Total,
P_TX = TX / Total,
P_WI = WI / Total)
# A tibble: 6 x 15
# CA TX WI Total date event_type snap_CA snap_TX snap_WI YEAR MONTH WEEKDAY P_CA P_TX P_WI
# <dbl> <dbl> <dbl> <dbl> <dttm> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 11047 7381 6984 25412 2011-02-01 00:00:00 NA 1 1 0 2011 2 3 0.435 0.290 0.275
#2 9925 5912 3309 19146 2011-02-02 00:00:00 NA 1 0 1 2011 2 4 0.518 0.309 0.173
#3 11322 9006 8883 29211 2011-02-03 00:00:00 NA 1 1 1 2011 2 5 0.388 0.308 0.304
#4 12251 6226 9533 28010 2011-02-04 00:00:00 NA 1 0 0 2011 2 6 0.437 0.222 0.340
#5 16610 9440 11882 37932 2011-02-05 00:00:00 NA 1 1 1 2011 2 7 0.438 0.249 0.313
#6 14696 9376 8664 32736 2011-02-06 00:00:00 Sporting 1 1 1 2011 2 1 0.449 0.286 0.265

Group data into multiple season and boxplot side by side using ggplot in R?

I would like to group data into multiple seasin such that my season are winter: Dec - Feb; Spring: Mar - May; Summer: Jun -Aug, and Fall: Sep - Nov. I would then like to boxplot the Winter and Spring seasonal data comparing A to B and then A to C. Here is my laborious code so far. I would appreciate an efficient way of data grouping and plotting.
library(tidyverse)
library(reshape2)
Dates30s = data.frame(seq(as.Date("2011-01-01"), to= as.Date("2040-12-31"),by="day"))
colnames(Dates30s) = "date"
FakeData = data.frame(A = runif(10958, min = 0.5, max = 1.5), B = runif(10958, min = 1.6, max = 2), C = runif(10958, min = 0.8, max = 1.8))
myData = data.frame(Dates30s, FakeData)
myData = separate(myData, date, sep = "-", into = c("Year", "Month", "Day"))
myData$Year = as.numeric(myData$Year)
myData$Month = as.numeric(myData$Month)
SeasonalData = myData %>% group_by(Year, Month) %>% summarise_all(funs(mean)) %>% select(Year, Month, A, B, C)
Spring = SeasonalData %>% filter(Month == 3 | Month == 4 |Month == 5)
Winter1 = SeasonalData %>% filter(Month == 12)
Winter1$Year = Winter1$Year+1
Winter2 = SeasonalData %>% filter(Month == 1 | Month == 2 )
Winter = rbind(Winter1, Winter2) %>% filter(Year >= 2012 & Year <= 2040) %>% group_by(Year) %>% summarise_all(funs(mean)) %>% select(-"Month")
BoxData = gather(Winter, key = "Variable", value = "value", -Year )
ggplot(BoxData, aes(x=Variable, y=value,fill=factor(Variable)))+
geom_boxplot() + labs(title="Winter") +facet_wrap(~Variable)
I would like to have Two figures: Figure 1 split in two; one for Winter season and one for Summer season (see BoxPlot 1) and one for Monthly annual average representing average monthly values across the entire time period (2011 -2040) see Boxplot 2

This is what I usually do it. All calculation and plotting are based on water year (WY) or hydrologic year from October to September.
library(tidyverse)
library(lubridate)
set.seed(123)
Dates30s <- data.frame(seq(as.Date("2011-01-01"), to = as.Date("2040-12-31"), by = "day"))
colnames(Dates30s) <- "date"
FakeData <- data.frame(A = runif(10958, min = 0.3, max = 1.5),
B = runif(10958, min = 1.2, max = 2),
C = runif(10958, min = 0.6, max = 1.8))
### Calculate Year, Month then Water year (WY) and Season
myData <- data.frame(Dates30s, FakeData) %>%
mutate(Year = year(date),
MonthNr = month(date),
Month = month(date, label = TRUE, abbr = TRUE)) %>%
mutate(WY = case_when(MonthNr > 9 ~ Year + 1,
TRUE ~ Year)) %>%
mutate(Season = case_when(MonthNr %in% 9:11 ~ "Fall",
MonthNr %in% c(12, 1, 2) ~ "Winter",
MonthNr %in% 3:5 ~ "Spring",
TRUE ~ "Summer")) %>%
select(-date, -MonthNr, -Year) %>%
as_tibble()
myData
#> # A tibble: 10,958 x 6
#> A B C Month WY Season
#> <dbl> <dbl> <dbl> <ord> <dbl> <chr>
#> 1 0.645 1.37 1.51 Jan 2011 Winter
#> 2 1.25 1.79 1.71 Jan 2011 Winter
#> 3 0.791 1.35 1.68 Jan 2011 Winter
#> 4 1.36 1.97 0.646 Jan 2011 Winter
#> 5 1.43 1.31 1.60 Jan 2011 Winter
#> 6 0.355 1.52 0.708 Jan 2011 Winter
#> 7 0.934 1.94 0.825 Jan 2011 Winter
#> 8 1.37 1.89 1.03 Jan 2011 Winter
#> 9 0.962 1.75 0.632 Jan 2011 Winter
#> 10 0.848 1.94 0.883 Jan 2011 Winter
#> # ... with 10,948 more rows
Calculate seasonal and monthly average by WY
### Seasonal Avg by WY
SeasonalAvg <- myData %>%
select(-Month) %>%
group_by(WY, Season) %>%
summarise_all(mean, na.rm = TRUE) %>%
ungroup() %>%
gather(key = "State", value = "MFI", -WY, -Season)
SeasonalAvg
#> # A tibble: 366 x 4
#> WY Season State MFI
#> <dbl> <chr> <chr> <dbl>
#> 1 2011 Fall A 0.939
#> 2 2011 Spring A 0.907
#> 3 2011 Summer A 0.896
#> 4 2011 Winter A 0.909
#> 5 2012 Fall A 0.895
#> 6 2012 Spring A 0.865
#> 7 2012 Summer A 0.933
#> 8 2012 Winter A 0.895
#> 9 2013 Fall A 0.879
#> 10 2013 Spring A 0.872
#> # ... with 356 more rows
### Monthly Avg by WY
MonthlyAvg <- myData %>%
select(-Season) %>%
group_by(WY, Month) %>%
summarise_all(mean, na.rm = TRUE) %>%
ungroup() %>%
gather(key = "State", value = "MFI", -WY, -Month) %>%
mutate(Month = factor(Month))
MonthlyAvg
#> # A tibble: 1,080 x 4
#> WY Month State MFI
#> <dbl> <ord> <chr> <dbl>
#> 1 2011 Jan A 1.00
#> 2 2011 Feb A 0.807
#> 3 2011 Mar A 0.910
#> 4 2011 Apr A 0.923
#> 5 2011 May A 0.888
#> 6 2011 Jun A 0.876
#> 7 2011 Jul A 0.909
#> 8 2011 Aug A 0.903
#> 9 2011 Sep A 0.939
#> 10 2012 Jan A 0.903
#> # ... with 1,070 more rows
Plot seasonal and monthly data
### Seasonal plot
s1 <- ggplot(SeasonalAvg, aes(x = Season, y = MFI, color = State)) +
geom_boxplot(position = position_dodge(width = 0.7)) +
geom_point(position = position_jitterdodge(seed = 123))
s1
### Monthly plot
m1 <- ggplot(MonthlyAvg, aes(x = Month, y = MFI, color = State)) +
geom_boxplot(position = position_dodge(width = 0.7)) +
geom_point(position = position_jitterdodge(seed = 123))
m1
Bonus
### https://stackoverflow.com/a/58369424/786542
# if (!require(devtools)) {
# install.packages('devtools')
# }
# devtools::install_github('erocoar/gghalves')
library(gghalves)
s2 <- ggplot(SeasonalAvg, aes(x = Season, y = MFI, color = State)) +
geom_half_boxplot(nudge = 0.05) +
geom_half_violin(aes(fill = State),
side = "r", nudge = 0.01) +
theme_light() +
theme(legend.position = "bottom") +
guides(fill = guide_legend(nrow = 1))
s2
s3 <- ggplot(SeasonalAvg, aes(x = Season, y = MFI, color = State)) +
geom_half_boxplot(nudge = 0.05, outlier.color = NA) +
geom_dotplot(aes(fill = State),
binaxis = "y", method = "histodot",
dotsize = 0.35,
stackdir = "up", position = PositionDodge) +
theme_light() +
theme(legend.position = "bottom") +
guides(color = guide_legend(nrow = 1))
s3
#> `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
Created on 2019-10-16 by the reprex package (v0.3.0)

Stacked bar chart with percentage labels

I created a stacked bar chart where the bars represent a percentage of the population. I would like to add labels to the 65+ category (or for all 3 categories if it is not possible to do it just for 1 category) showing the % value for each year. If I add geom_text(label = datm$value), the bars become extremely small because the labels represent absolute values instead of percentages. This is my code:
dat <- read.table(text = "2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018
0-20 24.0 23.9 23.7 23.5 23.3 23.1 22.9 22.7 22.5 22.3 22.2
20-65 61.3 61.2 61.0 60.9 60.5 60.1 59.8 59.6 59.3 59.1 59.0
65+ 14.8 15.0 15.3 15.6 16.2 16.8 17.4 17.7 18.2 18.5 18.8", sep = " ", header = TRUE)
library(reshape)
datm <- melt(cbind(dat, ind = rownames(dat)), id.vars = c('ind'))
library(scales)
library(ggplot2)
ggplot(datm,aes(x = variable, y = value, fill = ind)) +
geom_bar(position = "fill",stat = "identity") +
scale_x_discrete(labels = c('2008', '2009', '2010', '2011', '2012', '2013',
'2014', '2015', '2016', '2017', '2018')) +
scale_y_continuous(labels = percent_format()) +
xlab('Year') +
ylab('% of population') +
ggtitle('Demographic trend in the Netherlands') +
scale_fill_manual(values = c("green", "blue", "darkgray"))

You can try this. Explanations in comments below:
library(dplyr)
# calculate percentage within each year
datm2 <- datm %>%
group_by(variable) %>%
mutate(p = value / sum(value)) %>%
ungroup()
> head(datm2)
# A tibble: 6 x 4
ind variable value p
<fct> <fct> <dbl> <dbl>
1 0-20 X2008 24 0.240
2 20-65 X2008 61.3 0.612
3 65+ X2008 14.8 0.148
4 0-20 X2009 23.9 0.239
5 20-65 X2009 61.2 0.611
6 65+ X2009 15 0.150
ggplot(datm2, aes(x = variable, y = value, fill = ind)) +
geom_col(position = "fill") + # geom_col is equivalent to geom_bar(stat = "identity")
geom_text(aes(label = scales::percent(p), # add layer for percentage values
alpha = ifelse(ind == "65+", 1, 0)), # only visible for 65+ category
position = position_fill(vjust = 0.5)) + # follow barplot's position
scale_x_discrete(labels = c('2008', '2009', '2010', '2011', '2012', '2013',
'2014', '2015', '2016', '2017', '2018')) +
scale_y_continuous(labels = percent_format()) +
scale_alpha_identity() +
xlab('Year') +
ylab('% of population') +
ggtitle('Demographic trend in the Netherlands') +
scale_fill_manual(values = c("green", "blue", "darkgray"))

How to change the linetype of one of two lines

I would like to change the linetype of one of my two lines in the plot, only making "line1" into a dashed one.
My plot:
My data looks like as bellow:
Year Sex value Rate Group
<dbl> <chr> <dbl> <dbl> <chr>
1 1912 Female 18 1.14 A
2 1912 Male 52 0.893 L
3 1913 Female 25 1.02 A
4 1913 Male 42 1.05 L
5 1914 Female 14 1.26 A
6 1914 Male 67 1.29 L
7 1915 Female 25 1.32 A
8 1915 Male 61 1.45 L
9 1916 Female 32 1.52 A
10 1916 Male 71 1.64 L
11 1917 Female 42 2.01 A
12 1917 Male 92 1.87 L
My code:
data %>% ggplot() +
geom_bar(aes(x = Year, y = value, fill= Sex), stat = "identity",
width=0.8,
alpha=0.8) +
geom_line(aes(x = Year, y = Rate * 100, colour= Group),
size = 1.0) +
scale_colour_manual(labels = c("line1","line2"),
values = c("red","blue"))+
scale_fill_manual(values = c("Female"="green","Male"="black"))+
guides(fill=guide_legend(title = "Number"),
color=guide_legend(title= "Ratio"))
Could anyone help? I tried for quite a while but failed. Thanks in advance.

As suggested in comments, and using the "name" term in each scale_*_manual to specify the legend names:
data %>%
ggplot() +
geom_bar(aes(x = Year, y = value, fill= Sex), stat = "identity",
width=0.8,
alpha=0.8) +
geom_line(aes(x = Year, y = Rate * 100,
colour = Group, linetype = Group),
size = 1.0) +
scale_colour_manual(name = "Ratio",
labels = c("line1","line2"),
values = c("red","blue"))+
scale_linetype_manual(name = "Ratio",
labels = c("line1","line2"),
values = c("dashed","solid"))+
scale_fill_manual(name = "Number",
values = c("Female"="green","Male"="black"))

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Correlation Coefficients between population and suicide in Rstudio - r

Related

Predicted values from a log model using add_predictions() are a long way off

Calculation of "average sales share " with dplyr::mutate

Group data into multiple season and boxplot side by side using ggplot in R?

Stacked bar chart with percentage labels

How to change the linetype of one of two lines

Categories

Resources