I am trying to create a user-defined function to replace missing values in each variable using specific probabilities.
I can get the probabilities to print, but the second part of the code does not seem to work and all the missing values still remain.
I do not get any particular error message so puzzled why it is not working.
My data:
library(dplyr)
library(glue)
structure(list(id = c("395891", "373742", "316241", "282072",
"341331", "251761", "154591", "125051", "095361", "141822", "281411",
"31571", "165191", "03212", "08091", "26172", "135561", "164331",
"344511", "37352"), ph201_01 = c(1L, NA, 1L, 1L, NA, 1L, 1L,
NA, NA, NA, 1L, NA, NA, 1L, NA, 1L, NA, NA, 1L, NA), ph201_02 = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), ph201_03 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA,
NA, NA, NA), ph201_04 = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -20L))
My code:
# Create user-defined function
# Randomly impute the missing values of one 0/1 column of the global `mydata`.
#
# var: name (character string) of the column to impute.
#
# Prints the prevalence of the column and returns a modified COPY of `mydata`.
# The global `mydata` is not changed by calling this function; the caller has
# to use the returned data frame.
create_mock_vars <- function(var) {
  column <- mydata[[var]]
  is_missing <- is.na(column)
  # get prevalence: share of rows coded 1 (missing rows stay in the denominator)
  prev <- round(sum(column, na.rm = TRUE) / nrow(mydata), 2)
  print(glue("Prevalence of {var} is {prev}."))
  # sample() pairs `prob` with 0:1 by position, so the weights must be
  # c(P(0), P(1)) = c(1 - prev, prev) for 1 to be drawn with probability prev.
  mydata[[var]][is_missing] <- sample(
    0:1,
    size = sum(is_missing),
    replace = TRUE,
    prob = c(1 - prev, prev)
  )
  return(mydata)
}
# Get list of variable names I want to impute
# (every column except the first one, which is `id` in the example data)
myvarnames = names(mydata[,-1])
# Apply my function
# NOTE(review): the value returned by sapply() is neither assigned nor used,
# so the `mydata` inspected by glimpse() below is still the original object.
sapply(myvarnames, create_mock_vars)
glimpse(mydata)
You can create the user-defined function as :
# Randomly impute the missing values of a 0/1 vector.
#
# x: vector coded 0/1 that may contain NA.
#
# Returns `x` with every NA replaced by a random draw from 0:1, where 1 is
# drawn with probability `prev` = sum of the observed values / length(x)
# (missing entries stay in the denominator). Observed entries are never
# changed. A vector without missing values, including an empty one, is
# returned as is.
create_mock_vars <- function(x) {
  is_missing <- is.na(x)
  n_missing <- sum(is_missing)
  # Nothing to impute; this also avoids 0 / 0 = NaN weights for empty input.
  if (n_missing == 0L) {
    return(x)
  }
  prev <- sum(x, na.rm = TRUE) / length(x)
  # sample() pairs `prob` with 0:1 by position: c(P(0), P(1)).
  x[is_missing] <- sample(
    0:1,
    size = n_missing,
    replace = TRUE,
    prob = c(1 - prev, prev)
  )
  x
}
and use lapply to apply it to each of the myvarnames columns:
# Impute every selected column, then write all imputed columns back into
# `mydata` in a single assignment.
imputed_cols <- lapply(mydata[myvarnames], create_mock_vars)
mydata[myvarnames] <- imputed_cols
Not entirely sure what you mean so here are two Base R solutions:
# Build df1 by imputing every column of `df`:
#   * non-numeric column: each NA is replaced by the non-missing value found
#     at position cumsum(!is.na(x)), i.e. the running count of non-missing
#     values so far.
#     NOTE(review): a leading NA yields index 0, which R drops when
#     subsetting -- check the result for columns that start with NA.
#   * numeric column that is entirely NA: a single NA_integer_, which
#     data.frame() recycles over the rows.
#   * any other numeric column: each NA is replaced by
#     sum(observed values) / length(x).
df1 <- data.frame(lapply(df, function(x) {
  if (!is.numeric(x)) {
    return(ifelse(is.na(x), na.omit(x)[cumsum(!is.na(x))], x))
  }
  if (all(is.na(x))) {
    return(NA_integer_)
  }
  ifelse(is.na(x), sum(x, na.rm = TRUE) / length(x), x)
}))
# Build df2 from `df`:
#   * non-numeric column: each NA is replaced by the non-missing value found
#     at position cumsum(!is.na(x)).
#   * numeric column: reduced to ONE number -- 1 when every entry is NA,
#     otherwise sum(observed values) / length(x). The result has length one,
#     so data.frame() recycles it and every row of that column, observed or
#     not, ends up holding this same number.
df2 <- data.frame(lapply(df, function(x) {
  if (!is.numeric(x)) {
    return(ifelse(is.na(x), na.omit(x)[cumsum(!is.na(x))], x))
  }
  if (sum(is.na(x)) == length(x)) {
    1
  } else {
    sum(x, na.rm = TRUE) / length(x)
  }
}))
Data:
df <- structure(
list(
id = c("395891", "373742", "316241", "282072",
"341331", "251761"),
ph201_01 = c(1L, NA, 1L, 1L, NA, 1L),
ph201_02 = c(
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_
),
ph201_03 = c(
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_
),
ph201_04 = c(
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_
)
),
class = c("tbl_df",
"tbl", "data.frame"),
row.names = c(NA, -6L)
)
I was able to solve this but without creating a user-defined function. Still, it would be great if someone could help me figure out what I did wrong for future reference?
# Impute, in place, the missing values of every column of `mydata` except the
# first one with random 0/1 draws.
myvarnames <- names(mydata[, -1])
for (i in myvarnames) {
  is_missing <- is.na(mydata[[i]])
  # prevalence of deficit: share of rows coded 1, missing rows included in
  # the denominator
  prev <- round(sum(mydata[[i]], na.rm = TRUE) / nrow(mydata), 2)
  # sample() pairs `prob` with 0:1 by position, so the weights must be
  # c(P(0), P(1)) = c(1 - prev, prev) for 1 to be drawn with probability prev.
  mydata[[i]][is_missing] <- sample(
    0:1,
    size = sum(is_missing),
    replace = TRUE,
    prob = c(1 - prev, prev)
  )
}
glimpse(mydata)
Related
I am working with a time-series cross-country dataset covering the period from 2003 to 2018. Each entry in the database corresponds to a protest event, the number of participants, level of engagement of the security services, and level of participant violence. I have multiple observations per year per country. I want to create a new df that counts the number of protests for each country (Count), the average number of participants (AvgParticipants), the average security services engagement (AvgSecurity), and the average level of participant violence (AvgPartViolence). Here is the code I have written thus far:
# Creating Yearly Protest Count Data
# Load packages
library(dplyr)
# Set working directory
setwd("~/Desktop/Cooptation and Protest")
# Load data
dat <- read.csv("reports.csv")
# Subset to relevant variables
dat <- dat %>%
select(cowcode, event_date, side, scope, part_violence, sec_engagement,
numparticipants)
# Convert event_date to only year
dat$event_date <- as.Date(dat$event_date)
dat$year <- as.numeric(format(dat$event_date,'%Y'))
# Count the events per year and country, then try to add the two averages.
# NOTE(review): the three summarise() calls are chained, so the second and
# third receive the output of the previous summarise() rather than the
# grouped event-level data -- presumably sec_engagement and part_violence are
# no longer available at that point; confirm.
my_summary_data <- dat %>%
group_by(year, cowcode) %>%
summarise(Count = n()) %>%
summarise(AvgSecurity = mean(sec_engagement)) %>%
summarise(AvgPartviolence = mean(part_violence))
I have no issue when I run summarise(Count = n()), but I can't get summarise(AvgSecurity = mean(sec_engagement)) and summarise(AvgPartviolence = mean(part_violence)) to work. Any advice would be appreciated. Below are some data for your convenience.
structure(list(cowcode = c(40L, 40L, 40L, 40L, 40L, 40L), event_date = structure(c(12183,
15302, 12173, 12173, 12393, 12583), class = "Date"), side = c(0L,
1L, 0L, 0L, 0L, 0L), scope = c(0L, 0L, 0L, 0L, 0L, 0L), part_violence = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), sec_engagement = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_), numparticipants = c("",
"", "", "", "2000", ""), year = c(2003, 2011, 2003, 2003, 2003,
2004)), row.names = c(NA, 6L), class = "data.frame")
The comment has it: compute all of the summaries in a single summarise() call.
library(tidyverse)
dat <- structure(list(cowcode = c(40L, 40L, 40L, 40L, 40L, 40L), event_date = structure(c(12183,
15302, 12173, 12173, 12393, 12583), class = "Date"), side = c(0L,
1L, 0L, 0L, 0L, 0L), scope = c(0L, 0L, 0L, 0L, 0L, 0L), part_violence = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), sec_engagement = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_), numparticipants = c("",
"", "", "", "2000", ""), year = c(2003, 2011, 2003, 2003, 2003,
2004)), row.names = c(NA, 6L), class = "data.frame")
# Derive the calendar year of every event from its date.
dat$event_date <- as.Date(dat$event_date)
dat$year <- as.numeric(format(dat$event_date, "%Y"))

# One row per year/country combination: number of events plus the mean
# security engagement and the mean participant violence. All three summaries
# are computed in the same summarise() call, i.e. on the grouped event data.
# mean() is called without na.rm, so a group containing a missing value
# yields NA for that average.
events_by_year_country <- group_by(dat, year, cowcode)
my_summary_data <- summarise(
  events_by_year_country,
  Count = n(),
  AvgSecurity = mean(sec_engagement),
  AvgPartviolence = mean(part_violence)
)
my_summary_data
I am working on a way to create and save charts in a certain company style. I found everything I need among the custom options of ggplot, but I am still stuck on building this into a loop. I have roughly 110 columns/variables to apply this to.
My data:
test <- structure(list(Intensitaet_Wareneingang = structure(c(Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_
), .Label = c("sehr gering", "gering", "mittel", "hoch", "sehr hoch",
"keine Angabe", "NA"), class = "factor"), Zufriedenheit_Wareneingang = c(NA,
"keine Angabe", "mittel", "hoch", "hoch", "mittel", "hoch", NA,
"sehr hoch", "keine Angabe", NA, "keine Angabe", "keine Angabe",
"keine Angabe", "hoch", "hoch", "mittel", "mittel", NA, "mittel",
NA), Intensitaet_Einlagerung = c(NA, "mittel", "gering", "hoch",
"hoch", "gering", "sehr hoch", NA, "sehr hoch", "mittel", NA,
"sehr gering", "sehr gering", "sehr gering", "gering", "sehr hoch",
"sehr hoch", "mittel", NA, "hoch", NA), Zufriedenheit_Einlagerung = structure(c(Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_
), .Label = c("sehr gering", "gering", "mittel", "hoch", "sehr hoch",
"keine Angabe", "NA"), class = "factor")), row.names = c(NA,
-21L), class = c("tbl_df", "tbl", "data.frame"))
My goal:
Sort the values that are valid; exclude missing values and those who chose not to reply in this column.
Plot only the valid values, choose the colour, and show the number of displayed cases.
Additionally, I want to include a write command so that I end up with 110 bar charts in my working directory.
Where I am stuck
# Attempt to draw one bar chart per column: loops over columns 1 to 4 by
# position, converts the column to a factor with the seven answer levels and
# tries to plot only the rows holding one of the five valid answers.
# NOTE(review): subset.data.frame() (and subset() inside labs()) is given
# only the logical result of %in%, with no data frame to subset; also, the
# print() call is closed before the first `+`, so the layers that follow are
# added to the value returned by print().
for (i in 1:4) {
test[,c(i)] <- factor((test[ ,c(i)]), levels = c("sehr gering" , "gering", "mittel", "hoch", "sehr hoch", "keine Angabe", "NA"))
print(ggplot(data=subset.data.frame((test[ ,c(i)]) %in% c("sehr gering" , "gering", "mittel", "hoch", "sehr hoch")), aes(x=(test[ ,c(i)])))) +
geom_bar(fill = "cornflowerblue",
color="black") +
geom_text(aes(label=stat(count)), stat = "count", vjust=-.75) +
labs(subtitle = paste("n gesamt: ", nrow(subset(test[ ,c(i)] %in% c("sehr gering" , "gering", "mittel", "hoch", "sehr hoch"))), y = "Häufigkeit")) +
scale_y_continuous(labels = scales::percent) }
This produces the error message:
error in rep_len(TRUE, nrow(x)) : invalid 'length.out' value.
Using ggplot (without print) and naming one variable explicitly works. Any suggestions on how I could finally get this loop running are highly appreciated. I have not found anything similar in the forum yet.
Thanks a lot
Try using this code -
In the for loop I pass column names instead of column numbers.
I use [[]] to subset the respective columns and convert them to factors.
I create subset_data as a separate data frame to make it easier to reuse.
I use .data to refer to the columns in aes.
library(ggplot2)
# Print one bar chart per column of `test`, based only on the valid answers.

# All answer categories in display order, and the five that count as valid.
all_levels <- c("sehr gering", "gering", "mittel", "hoch", "sehr hoch",
                "keine Angabe", "NA")
valid_levels <- c("sehr gering", "gering", "mittel", "hoch", "sehr hoch")

for (i in colnames(test)) {
  test[[i]] <- factor(test[[i]], levels = all_levels)
  # Keep only the rows with a valid answer in the current column; this drops
  # missing values as well as "keine Angabe" and the literal "NA" level.
  subset_data <- test[test[[i]] %in% valid_levels, ]
  print(
    ggplot(data = subset_data, aes(x = .data[[i]])) +
      geom_bar(fill = "cornflowerblue", color = "black") +
      geom_text(aes(label = after_stat(count)), stat = "count", vjust = -.75) +
      # `y` is an argument of labs(), not of paste(): inside paste() the text
      # "Häufigkeit" was appended to the subtitle and the axis title never set.
      labs(
        subtitle = paste("n gesamt: ", nrow(subset_data)),
        y = "Häufigkeit"
      ) +
      # NOTE(review): the bars show counts, so percent labels on the y axis
      # read e.g. 300% for a count of 3 -- confirm this is intended.
      scale_y_continuous(labels = scales::percent)
  )
}
I have this data and I want to make a new column:
structure(list(AGE_GROUP = c("21-30", "31-40", "41-50"), DATE = c("12/17/2020",
"12/17/2020", "12/17/2020"), VACCINE_COUNT = c(36L, 47L, 26L),
PERC_TOTAL_VACC = c(24.82758621, 32.4137931, 17.93103448),
RECIPIENT_COUNT = c(NA_integer_, NA_integer_, NA_integer_
), PERC_TOTAL_RECIP = c(NA_real_, NA_real_, NA_real_), RECIP_FULLY_VACC = c(NA_integer_,
NA_integer_, NA_integer_), PERC_FULLY_VACC = c(NA_real_,
NA_real_, NA_real_)), row.names = c(NA, 3L), class = "data.frame")
Based on the age group, I want to add a column that contains the numbers c(8, 12, 13, 16, 14, 12), and repeat these numbers 3 times. So the outcome is a new column in which the numbers above appear 3 times.
I have used this code: vaccine <- vaccine %>% mutate(new_col = rep(list(vals), n())) %>% unnest()
and I get something like this:
"12/18/2020", "12/18/2020"), VACCINE_COUNT = c(421L, 421L, 421L
), PERC_TOTAL_VACC = c(15.52932497, 15.52932497, 15.52932497),
RECIPIENT_COUNT = c(NA_integer_, NA_integer_, NA_integer_
), PERC_TOTAL_RECIP = c(NA_real_, NA_real_, NA_real_), RECIP_FULLY_VACC = c(NA_integer_,
NA_integer_, NA_integer_), PERC_FULLY_VACC = c(NA_real_,
NA_real_, NA_real_), X = c(NA, NA, NA), X.1 = c(14L, 14L,
14L), new_col = c(8, 12, 13)), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))
whereas I want to keep all of my data and just repeat the values.
Do you mean to repeat the values c(8, 12,13,16,14,12) for each row in the dataframe? Try :
library(dplyr)
library(tidyr)
vals <- c(8, 12, 13, 16, 14, 12)
# Give every row of `df` its own copy of `vals` as a list element, then
# expand that list column so that each row is repeated once per value.
with_list_col <- mutate(df, new_col = rep(list(vals), n()))
unnest(with_list_col, new_col)
Using base R
# Repeat every row of df1 length(vals) times in a row, then attach `vals`
# as new_col; the shorter `vals` is recycled over the expanded rows.
row_idx <- rep(seq_len(nrow(df1)), each = length(vals))
transform(df1[row_idx, ], new_col = vals)
Or with uncount
library(dplyr)
library(tidyr)
# Duplicate each row of df1 length(vals) times with uncount(), then fill
# new_col by cycling through `vals` until every expanded row has a value.
mutate(
  uncount(df1, length(vals)),
  new_col = rep(vals, length.out = n())
)
If we need to just replicate and store the column, wrap in a list
# Store the complete `vals` vector in every row as a list column; the number
# of rows of df1 does not change.
mutate(df1, new_col = list(vals))
data
vals <- c(8, 12,13,16,14,12)
DATA
df <- structure(list(ID = c("51-07519", "51-07522", "51-07525", "51-07526",
"51-07527", "51-07530"), name = c("Fyb", "Fyb", "Fyb", "Fyb",
"Fyb", "Fyb"), serology_charts = c(0L, 0L, NA, 0L, 1L, 1L), antibodies_chart = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), bioarray_charts = c(NA, 0L, NA, 0L, NA, NA), others_charts = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), Fyb = c(1, 1, 1, 1, 1, 1), GATAfactor = c(0, 0, 1, 0, 0.5,
0.5)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
))
I currently run the following filter:
# Keep the rows in which at least one column whose name ends in "charts" is
# non-missing and differs from Fyb * GATAfactor.
# NOTE(review): in the example data `antibodies_chart` has no trailing "s",
# so ends_with("charts") does not select it -- confirm this is intended.
filter_at(
  df,
  vars(ends_with("charts")),
  any_vars(!is.na(.) & . != Fyb * GATAfactor)
)
Is it possible to write an if statement as follows:
if Fyb!=1 {filter_at(vars(ends_with("charts")), any_vars(!is.na(.) & . != Fyb))}
else {filter_at(vars(ends_with("charts")), any_vars(!is.na(.) & . != Fyb*GATAfactor))}
We can wrap the condition in a case_when or ifelse
library(dplyr)
# Keep the rows in which at least one column whose name ends in "charts" is
# non-missing and differs from the reference value, chosen row by row:
# Fyb * GATAfactor where Fyb == 1, plain Fyb everywhere else.
filter_at(
  df,
  vars(ends_with("charts")),
  any_vars(
    case_when(
      Fyb == 1 ~ !is.na(.) & . != Fyb * GATAfactor,
      TRUE ~ !is.na(.) & . != Fyb
    )
  )
)
Or using ifelse
# Same filter written with ifelse(): for rows with Fyb == 1 a "charts" column
# is compared against Fyb * GATAfactor, otherwise against Fyb; a row is kept
# when any such column is non-missing and differs from that value.
filter_at(
  df,
  vars(ends_with("charts")),
  any_vars(
    ifelse(
      Fyb == 1,
      !is.na(.) & . != Fyb * GATAfactor,
      !is.na(.) & . != Fyb
    )
  )
)
I'm trying to replace values in myDF1 from myDF2, where rows match for column "studyno" but the solutions I have found so far don't seem to be giving me the desired output.
Below are the data.frames:
myDF1 <- structure(list(studyno = c("J1000/9", "J1000/9", "J1000/9", "J1000/9",
"J1000/9", "J1000/9"), date = structure(c(17123, 17127, 17135,
17144, 17148, 17155), class = "Date"), pf_mcl = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), year = c(2016, 2016, 2016, 2016, 2016, 2016)), .Names = c("studyno",
"date", "pf_mcl", "year"), row.names = c(NA, 6L), class = "data.frame")
myDF2 <- structure(list(studyno = c("J740/4", "J1000/9", "J895/7", "J931/6",
"J609/1", "J941/3"), pf_mcl = c(0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("studyno",
"pf_mcl"), row.names = c(NA, 6L), class = "data.frame")
One solution I tried that seemed to work is shown below; however, I find that whatever values were in myDF1 before have been removed.
# Position of each myDF1 row's studyno within myDF2 (NA where no match).
matched_rows <- match(myDF1$studyno, myDF2$studyno)
# Overwrite pf_mcl wholesale with the looked-up myDF2 values; rows without a
# match receive NA, whatever myDF1 held before.
myDF1$pf_mcl <- myDF2$pf_mcl[matched_rows]
# Left-join myDF2 onto myDF1 by "studyno" so that every myDF1 row is kept.
# Both inputs carry a pf_mcl column; merge() names them pf_mcl.x (from
# myDF1) and pf_mcl.y (from myDF2).
agg_df <- merge(x = myDF1, y = myDF2, by = "studyno", all.x = TRUE)
# Use the myDF2 value when it is available; otherwise keep the value that
# was already in myDF1.
agg_df$pf_mcl <- with(agg_df, ifelse(is.na(pf_mcl.y), pf_mcl.x, pf_mcl.y))
# Go back to the original myDF1 columns, in their original order.
myDF1 <- agg_df[, names(myDF1)]