How to use summarise function in dplyr with multiple conditions? - r

I am working with a time-series cross-country dataset covering the period from 2003 to 2018. Each entry in the database corresponds to a protest event, the number of participants, level of engagement of the security services, and level of participant violence. I have multiple observations per year per country. I want to create a new df that counts the number of protests for each country (Count), the average number of participants (AvgParticipants), the average security services engagement (AvgSecurity), and the average level of participant violence (AvgPartViolence). Here is the code I have written thus far:
# Creating Yearly Protest Count Data
# Load packages
library(dplyr)
# Set working directory
setwd("~/Desktop/Cooptation and Protest")
# Load data
dat <- read.csv("reports.csv")
# Subset to relevant variables
dat <- dat %>%
select(cowcode, event_date, side, scope, part_violence, sec_engagement,
numparticipants)
# Convert event_date to only year
dat$event_date <- as.Date(dat$event_date)
dat$year <- as.numeric(format(dat$event_date,'%Y'))
my_summary_data <- dat %>%
group_by(year, cowcode) %>%
summarise(Count = n()) %>%
summarise(AvgSecurity = mean(sec_engagement)) %>%
summarise(AvgPartviolence = mean(part_violence))
I have no issue when I run summarise(Count = n()), but I can't get running summarise(AvgSecurity = mean(sec_engagement)) and summarise(AvgPartviolence = mean(part_violence)) to work. Any advice would be appreciated. Below are some data for your convenience.
structure(list(cowcode = c(40L, 40L, 40L, 40L, 40L, 40L), event_date = structure(c(12183,
15302, 12173, 12173, 12393, 12583), class = "Date"), side = c(0L,
1L, 0L, 0L, 0L, 0L), scope = c(0L, 0L, 0L, 0L, 0L, 0L), part_violence = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), sec_engagement = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_), numparticipants = c("",
"", "", "", "2000", ""), year = c(2003, 2011, 2003, 2003, 2003,
2004)), row.names = c(NA, 6L), class = "data.frame")

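The comment has it: put all the summaries inside a single summarise() call. After the first summarise(), the data are collapsed to one row per group and the original columns (sec_engagement, part_violence) are no longer available to any later summarise() call.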
library(tidyverse)
dat <- structure(list(cowcode = c(40L, 40L, 40L, 40L, 40L, 40L), event_date = structure(c(12183,
15302, 12173, 12173, 12393, 12583), class = "Date"), side = c(0L,
1L, 0L, 0L, 0L, 0L), scope = c(0L, 0L, 0L, 0L, 0L, 0L), part_violence = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), sec_engagement = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_), numparticipants = c("",
"", "", "", "2000", ""), year = c(2003, 2011, 2003, 2003, 2003,
2004)), row.names = c(NA, 6L), class = "data.frame")
# Make sure event_date is a Date and derive the calendar year from it
dat$event_date <- as.Date(dat$event_date)
dat$year <- as.numeric(format(dat$event_date, "%Y"))

# One row per country-year. Every statistic is computed in a single
# summarise() call: once summarise() has run, only the grouping columns and
# the new summary columns remain, so a second summarise() can no longer see
# sec_engagement or part_violence.
my_summary_data <- dat %>%
  group_by(year, cowcode) %>%
  summarise(
    Count = n(),
    # numparticipants is stored as text; "" becomes NA and is skipped.
    # Any other non-numeric text also becomes NA (with a coercion warning).
    AvgParticipants = mean(as.numeric(numparticipants), na.rm = TRUE),
    # na.rm = TRUE so that one missing value does not turn the whole group
    # mean into NA (a group with no observed values yields NaN)
    AvgSecurity = mean(sec_engagement, na.rm = TRUE),
    AvgPartviolence = mean(part_violence, na.rm = TRUE),
    # return an ungrouped result so later steps are not silently grouped by year
    .groups = "drop"
  )
my_summary_data

Related

mistake when using dplyr, trying to plot a variable in proportion of the total

I have a dataset which has the following structure < dput(head(df)) > :
structure(list(type_de_sejour = c("Amb", "Hosp",
"Hosp", "Amb", "Hosp", "Sea"),
specialite = c("ANES", "ANES",
"Autres", "CARD", "CARD", "CARD"
), CA_annee_N = c(2712L, 122180L, 0L, 822615L, 6905494L,
0L), nb_sejours_N = c(8L, 32L, 0L, 1052L, 2776L, 0L), nb_doc_N = c(5L,
8L, 0L, 12L, 15L, 0L), CA_annee_N1 = c(4231L, 78858L, 6587L,
327441L, 6413083L, 0L), nb_sejours_N1 = c(13L, 29L, 2L, 532L,
2819L, 0L), nb_doc_N1 = c(6L, 9L, 1L, 12L, 12L, 0L
), CA_annee_N2 = c(4551L, 27432L, 0L, 208326L, 7465440L,
575L), nb_sejours_N2 = c(15L, 8L, 0L, 463L, 3393L, 1L), nb_doc_N2 = c(6L,
4L, 0L, 11L, 13L, 1L), site = c("FR", "FR", "FR", "FR",
"FR", "FR")), row.names = c(NA, 6L), class = "data.frame")
I am trying to plot a graph showing the percentage each "specialite" (distinguishing per "site", ideally by faceting or doing 2 plots, one per site) represents in the total "nb_sejours_N", after having filtered by type_de_sejour == "Amb".
I have tried the following code :
df %>%
mutate(volume_N == nb_sejours_N,
volume_N1 == nb_sejours_N1,
volume_N2 == nb_sejours_N2)%>%
filter(type_de_sejour == "Amb")%>%
group_by(site) %>%
mutate(proportion_N = volume_N/sum(volume_N, na.rm = TRUE),
proportion_N1 = volume_N1/sum(volume_N1, na.rm = TRUE),
proportion_N2 = volume_N2/sum(volume_N2, na.rm = TRUE))
Unfortunately, it doesn't work, so I can't go any further. I would also like to know if anyone knows an efficient code to plot what I'm trying to represent ?
I believe the following works:
# creating plot
p = df %>% filter(type_de_sejour == "Amb") %>%
pivot_longer(cols = c("nb_sejours_N","nb_sejours_N1","nb_sejours_N2"), values_to = "visit") %>%
ggplot(aes(fill=name, y=visit, x=name)) + geom_bar(position="stack", stat="identity")
# creating summary of totals for each column
totals = df %>% filter(type_de_sejour == "Amb") %>%
pivot_longer(cols = c("nb_sejours_N","nb_sejours_N1","nb_sejours_N2"), values_to = "visit") %>%
group_by(name) %>% summarise(total = sum(visit))
# adding totals on top of bars to plot
p + geom_text(aes(name, total, label = total, fill = NULL), data = totals)

position=dodge in geom_col in barplot

Here is a dataset of soccer players. I need to visualise the total number of yellow cards received next to the number of games played per country in one bar plot, so I need to calculate the total number of yellow cards and the total number of games per league country and bring the data into long format.
dput(head(new_soccer_referee))
structure(list(playerShort = c("lucas-wilchez", "john-utaka",
"abdon-prats", "pablo-mari", "ruben-pena", "aaron-hughes"), player = c("Lucas Wilchez",
"John Utaka", " Abdón Prats", " Pablo Marí", " Rubén Peña", "Aaron Hughes"
), club = c("Real Zaragoza", "Montpellier HSC", "RCD Mallorca",
"RCD Mallorca", "Real Valladolid", "Fulham FC"), leagueCountry = c("Spain",
"France", "Spain", "Spain", "Spain", "England"), birthday = structure(c(4990,
4390, 8386, 8643, 7868, 3598), class = "Date"), height = c(177L,
179L, 181L, 191L, 172L, 182L), weight = c(72L, 82L, 79L, 87L,
70L, 71L), position = c("Attacking Midfielder", "Right Winger",
NA, "Center Back", "Right Midfielder", "Center Back"), games = c(1L,
1L, 1L, 1L, 1L, 1L), victories = c(0L, 0L, 0L, 1L, 1L, 0L), ties = c(0L,
0L, 1L, 0L, 0L, 0L), defeats = c(1L, 1L, 0L, 0L, 0L, 1L), goals = c(0L,
0L, 0L, 0L, 0L, 0L), yellowCards = c(0L, 1L, 1L, 0L, 0L, 0L),
yellowReds = c(0L, 0L, 0L, 0L, 0L, 0L), redCards = c(0L,
0L, 0L, 0L, 0L, 0L), photoID = c("95212.jpg", "1663.jpg",
NA, NA, NA, "3868.jpg"), rater1 = c(0.25, 0.75, NA, NA, NA,
0.25), rater2 = c(0.5, 0.75, NA, NA, NA, 0), refNum = c(1L,
2L, 3L, 3L, 3L, 4L), refCountry = c(1L, 2L, 3L, 3L, 3L, 4L
), Alpha_3 = c("GRC", "ZMB", "ESP", "ESP", "ESP", "LUX"),
meanIAT = c(0.326391469021736, 0.203374724564378, 0.369893594187172,
0.369893594187172, 0.369893594187172, 0.325185154120009),
nIAT = c(712L, 40L, 1785L, 1785L, 1785L, 127L), seIAT = c(0.000564112354334542,
0.0108748941063986, 0.000229489640866464, 0.000229489640866464,
0.000229489640866464, 0.00329680952361961), meanExp = c(0.396,
-0.204081632653061, 0.588297311544544, 0.588297311544544,
0.588297311544544, 0.538461538461538), nExp = c(750L, 49L,
1897L, 1897L, 1897L, 130L), seExp = c(0.0026964901062936,
0.0615044043187379, 0.00100164730649311, 0.00100164730649311,
0.00100164730649311, 0.013752210497518), BMI = c(22.98190175237,
25.5922099809619, 24.1140380330271, 23.8480304816206, 23.6614386154678,
21.4346093466973), position_new = c("Offense", "Offense",
"Goalkeeper", "Defense", "Midfield", "Defense"), rater_mean = c(0.375,
0.75, NA, NA, NA, 0.125), ageinyear = c(28, 30, 19, 18, 20,
32), ageinyears = c(28, 30, 19, 18, 20, 32)), row.names = c(NA,
6L), class = "data.frame")
Use the data to draw a bar plot with the following characteristics:
– The x-axis displays the league country while the y-axis displays the number of games and the number of cards
– For each country there are two bars next to each other: one for the games played and one for the cards received
barplot <- ggplot(new_soccer_referee,aes(x=leagueCountry,y=number))
barplot +
geom_bar(fill=c("games","yellowCards")) +
geom_col(Position="dodge") +
labels(x="leagueCountry", y="number")
ggplot
`
I know it is pretty messy but I am really confused how to build up the layers with ggplot and how to work out the long format, can anyone help?
One option would be to first aggregate your data to compute the number of yellowCards and games by leagueCountry. Afterwards you could convert to long which makes it easy to plot via ggplot2.
Using some fake random example data to mimic your real data:
set.seed(123)
new_soccer_referee <- data.frame(
player = sample(letters, 20),
leagueCountry = sample(c("Spain", "France", "England", "Italy"), 20, replace = TRUE),
yellowCards = sample(1:5, 20, replace = TRUE),
games = sample(1:20, 20, replace = TRUE)
)
library(dplyr)
library(tidyr)
library(ggplot2)
# Step 1: total yellow cards and games per league country (one row each)
country_totals <- new_soccer_referee %>%
  group_by(leagueCountry) %>%
  summarise(across(c(yellowCards, games), sum))

# Step 2: long format, one row per country and measure; the measure name
# goes to `variable` and its total to `number`
new_soccer_referee_long <- pivot_longer(
  country_totals,
  cols = -leagueCountry,
  names_to = "variable",
  values_to = "number"
)

# Step 3: two bars side by side for every country, coloured by measure
ggplot(
  new_soccer_referee_long,
  aes(x = leagueCountry, y = number, fill = variable)
) +
  geom_col(position = "dodge")
Something like this:
library(tidyverse)
# Sum games and yellow cards per league country, reshape to long format and
# draw the two totals as side-by-side bars for each country
new_soccer_referee %>%
  select(leagueCountry, games, yellowCards) %>%
  group_by(leagueCountry) %>%
  summarise(
    games = sum(games),
    # was misspelled "yellowCars", which ended up as a label in the legend
    yellowCards = sum(yellowCards)
  ) %>%
  # default names: the measure goes to `name`, its total to `value`
  pivot_longer(-leagueCountry) %>%
  ggplot(aes(x = leagueCountry, fill = name, y = value)) +
  geom_col(position = position_dodge())

using if within filter_at()

DATA
df <- structure(list(ID = c("51-07519", "51-07522", "51-07525", "51-07526",
"51-07527", "51-07530"), name = c("Fyb", "Fyb", "Fyb", "Fyb",
"Fyb", "Fyb"), serology_charts = c(0L, 0L, NA, 0L, 1L, 1L), antibodies_chart = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), bioarray_charts = c(NA, 0L, NA, 0L, NA, NA), others_charts = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), Fyb = c(1, 1, 1, 1, 1, 1), GATAfactor = c(0, 0, 1, 0, 0.5,
0.5)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
))
I currently run the following filter:
df%>%
filter_at(vars(ends_with("charts")), any_vars(!is.na(.) & . != Fyb*GATAfactor))
Is it possible to write an if statement as follows:
if Fyb!=1 {filter_at(vars(ends_with("charts")), any_vars(!is.na(.) & . != Fyb))}
else {filter_at(vars(ends_with("charts")), any_vars(!is.na(.) & . != Fyb*GATAfactor))}
We can wrap the condition in a case_when or ifelse
library(dplyr)
# Keep rows where at least one column whose name ends in "charts" is
# non-missing and differs from a reference value: Fyb*GATAfactor when
# Fyb == 1, otherwise Fyb itself.
# NOTE: in the example data, antibodies_chart does not end in "charts", so it
# is not among the columns checked.
df %>%
filter_at(vars(ends_with("charts")),
any_vars(case_when(Fyb == 1 ~ !is.na(.) & . != Fyb*GATAfactor,
TRUE ~ !is.na(.) & . != Fyb)))
Or using ifelse
# Same filter written with ifelse(): for each column whose name ends in
# "charts", compare against Fyb*GATAfactor when Fyb == 1 and against Fyb
# otherwise; a row is kept if any such column is non-missing and differs.
df %>%
filter_at(vars(ends_with("charts")),
any_vars(ifelse(Fyb == 1, !is.na(.) & . != Fyb*GATAfactor, !is.na(.) & . != Fyb)))

How can I rearrange the date from d-m-y to m-d-y in R?

I am having issues with the following R code. I am trying to rearrange csv date values in a column from day-month-year to month-day-year. Two issues arise: the format is changed to year-month-day instead, and this error message appears when I attempt to plot the results:
Error: Column New_Date is a date/time and must be stored as POSIXct, not POSIXlt.
I am new to R and unsure on how to fix this error.
I have gone through a lot of similar topics, however because of lack of knowledge in R, I am unable to understand whether these topics can translate to my own code, and the information that I need.
Any help is much appreciated. The code is due relatively soon, so any fast responses are going to be worshipped. Thanks!
structure(list(Date = structure(c(48L, 11L, 36L, 35L, 1L, 14L
), .Label = c("01-02-18", "02-03-18", "02-10-18", "03-01-18",
"03-04-18", "03-05-18", "03-08-18", "03-09-18", "05-07-18", "05-12-18",
"07-02-18", "07-06-18", "07-11-18", "08-03-18", "09-01-18", "09-05-18",
"09-08-18", "09-10-18", "10-01-18", "10-04-18", "10-09-18", "11-07-18",
"12-11-18", "12-12-18", "13-02-18", "13-06-18", "14-03-18", "14-09-18",
"15-01-18", "15-05-18", "16-04-18", "16-08-18", "17-07-18", "18-12-18",
"19-01-18", "19-02-18", "19-06-18", "19-10-18", "19-11-18", "20-03-18",
"20-04-18", "20-08-18", "20-09-18", "21-05-18", "23-07-18", "23-11-18",
"24-12-18", "25-01-18", "25-02-18", "25-05-18", "25-06-18", "25-10-18",
"26-03-18", "26-09-18", "27-04-18", "29-08-18", "30-07-18", "31-05-18",
"31-10-18"), class = "factor"), New_Date = structure(list(sec = c(0,
0, 0, 0, 0, 0), min = c(0L, 0L, 0L, 0L, 0L, 0L), hour = c(0L,
0L, 0L, 0L, 0L, 0L), mday = c(25L, 7L, 19L, 19L, 1L, 8L), mon = c(0L,
1L, 1L, 0L, 1L, 2L), year = c(-1882L, -1882L, -1882L, -1882L,
-1882L, -1882L), wday = c(4L, 3L, 1L, 5L, 4L, 4L), yday = c(24L,
37L, 49L, 18L, 31L, 66L), isdst = c(0L, 0L, 0L, 0L, 0L, 0L),
zone = c("LMT", "LMT", "LMT", "LMT", "LMT", "LMT"), gmtoff = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
)), class = c("POSIXlt", "POSIXt"))), row.names = c(NA, 6L
), class = "data.frame")
EDIT:
Now having this error appear: "'Error in plot.window(...) : need finite 'xlim' values"
Below is my code:
beaches$Date = as.Date(as.character(beaches$Date), '%d-%m-%y')
beaches$New_Date = format(beaches$Date, '%m-%d-%y')
Palm_beach = filter(beaches, Site == "Palm Beach")
Shelly_beach = filter(beaches, Site == "Shelly Beach (Manly)")
plot(Palm_beach$Date, Palm_beach$Enterococci..cfu.100ml., col = "green", main = "Palm Beach vs Shelly Beach", xlab = "Dates", ylab = "Enterococci (cfu)")
points(Shelly_beach$Date, Shelly_beach$Enterococci..cfu.100ml., col = "red")
Try this:
# Parse the day-month-year text into a real Date (Date itself prints as
# year-month-day), then render it as month-day-year text in New_Date
beaches$Date <- as.Date(as.character(beaches$Date), format = "%d-%m-%y")
beaches$New_Date <- format(beaches$Date, "%m-%d-%y")
Output:
> head(beaches[, c('Date', 'New_Date')])
Date New_Date
1 2018-01-25 01-25-18
2 2018-02-07 02-07-18
3 2018-02-19 02-19-18
4 2018-01-19 01-19-18
5 2018-02-01 02-01-18
6 2018-03-08 03-08-18
Since neither the input nor the output is a date, it might make more sense to just use regular expressions rather than converting to and from dates:
# Capture the three "-"-separated digit groups and swap the first two
# (day and month); the third group (year) stays in place. No Date conversion.
beaches$New_Date <- sub("(\\d+)-(\\d+)-(\\d+)", "\\2-\\1-\\3", beaches$Date)
#### OUTPUT ####
Date New_Date
1 25-01-18 01-25-18
2 07-02-18 02-07-18
3 19-02-18 02-19-18
4 19-01-18 01-19-18
5 01-02-18 02-01-18
6 08-03-18 03-08-18
First of all, you have to make sure that the original Date column is in character format.
In your data it is in factor format. Then you first have to convert the Date column to a date format and then you can create the New_Date column:
# Convert the factor to character, then parse it as day-month-(2-digit year)
df$Date <- as.Date(as.character(df$Date), format = "%d-%m-%y")
# Render the Date as month-day-year text; %Y gives the four-digit year
df$New_Date <- format(df$Date, "%m-%d-%Y")
If you only want the last two digits of the year column you can use this instead:
df$New_Date2 <- format(df$Date, "%m-%d-%y")

R - Populate one data frame with values from another dataframe, based on row matching

I'm trying to replace values in myDF1 from myDF2, where rows match for column "studyno" but the solutions I have found so far don't seem to be giving me the desired output.
Below are the data.frames:
myDF1 <- structure(list(studyno = c("J1000/9", "J1000/9", "J1000/9", "J1000/9",
"J1000/9", "J1000/9"), date = structure(c(17123, 17127, 17135,
17144, 17148, 17155), class = "Date"), pf_mcl = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), year = c(2016, 2016, 2016, 2016, 2016, 2016)), .Names = c("studyno",
"date", "pf_mcl", "year"), row.names = c(NA, 6L), class = "data.frame")
myDF2 <- structure(list(studyno = c("J740/4", "J1000/9", "J895/7", "J931/6",
"J609/1", "J941/3"), pf_mcl = c(0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("studyno",
"pf_mcl"), row.names = c(NA, 6L), class = "data.frame")
One solution I tried that seemed to work is shown below, however, I find that whatever values were in myDF1 before have been removed.
myDF1$pf_mcl <- myDF2$pf_mcl[match(myDF1$studyno, myDF2$studyno)]
# Fill myDF1$pf_mcl from myDF2 for every row whose "studyno" appears in
# myDF2, and keep the value already in myDF1 when myDF2 has no (non-missing)
# value for that study number.
# match() is used instead of merge() so that the row order of myDF1 is
# preserved and duplicated keys in myDF2 cannot multiply rows (only the
# first matching row of myDF2 is used).
lookup_idx <- match(myDF1$studyno, myDF2$studyno)  # NA where there is no match
new_vals <- myDF2$pf_mcl[lookup_idx]

# Overwrite only where myDF2 actually supplies a value
has_new <- !is.na(new_vals)
myDF1$pf_mcl[has_new] <- new_vals[has_new]

Resources