Related
I am working with a time-series cross-country dataset covering the period from 2003 to 2018. Each entry in the database corresponds to a protest event, the number of participants, level of engagement of the security services, and level of participant violence. I have multiple observations per year per country. I want to create a new df that counts the number of protests for each country (Count), the average number of participants (AvgParticipants), the average security services engagement (AvgSecurity), and the average level of participant violence (AvgPartViolence). Here is the code I have written thus far:
# Creating Yearly Protest Count Data
# Load packages
library(dplyr)
# Set working directory
setwd("~/Desktop/Cooptation and Protest")
# Load data
dat <- read.csv("reports.csv")
# Subset to relevant variables
dat <- dat %>%
select(cowcode, event_date, side, scope, part_violence, sec_engagement,
numparticipants)
# Convert event_date to only year
dat$event_date <- as.Date(dat$event_date)
dat$year <- as.numeric(format(dat$event_date,'%Y'))
my_summary_data <- dat %>%
group_by(year, cowcode) %>%
summarise(Count = n()) %>%
summarise(AvgSecurity = mean(sec_engagement)) %>%
summarise(AvgPartviolence = mean(part_violence))
I have no issue when I run summarise(Count = n()), but I can't get running summarise(AvgSecurity = mean(sec_engagement)) and summarise(AvgPartviolence = mean(part_violence)) to work. Any advice would be appreciated. Below are some data for your convenience.
structure(list(cowcode = c(40L, 40L, 40L, 40L, 40L, 40L), event_date = structure(c(12183,
15302, 12173, 12173, 12393, 12583), class = "Date"), side = c(0L,
1L, 0L, 0L, 0L, 0L), scope = c(0L, 0L, 0L, 0L, 0L, 0L), part_violence = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), sec_engagement = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_), numparticipants = c("",
"", "", "", "2000", ""), year = c(2003, 2011, 2003, 2003, 2003,
2004)), row.names = c(NA, 6L), class = "data.frame")
The comment has it!
library(tidyverse)
dat <- structure(list(cowcode = c(40L, 40L, 40L, 40L, 40L, 40L), event_date = structure(c(12183,
15302, 12173, 12173, 12393, 12583), class = "Date"), side = c(0L,
1L, 0L, 0L, 0L, 0L), scope = c(0L, 0L, 0L, 0L, 0L, 0L), part_violence = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), sec_engagement = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_), numparticipants = c("",
"", "", "", "2000", ""), year = c(2003, 2011, 2003, 2003, 2003,
2004)), row.names = c(NA, 6L), class = "data.frame")
dat$event_date <- as.Date(dat$event_date)
dat$year <- as.numeric(format(dat$event_date,'%Y'))
my_summary_data <- dat %>%
group_by(year, cowcode) %>%
summarise(Count = n(),
AvgSecurity = mean(sec_engagement),
AvgPartviolence = mean(part_violence))
my_summary_data
I am tweaking on a way to calculate and save charts in a certain company style. I found all things I need with the custom options of ggplot but am still stuck to build this into a loop. I have roughly 110 Columns/Variables to apply this to.
My data:
test <- structure(list(Intensitaet_Wareneingang = structure(c(Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_,
Intensitaet_Wareneingang = NA_integer_, Intensitaet_Wareneingang = NA_integer_
), .Label = c("sehr gering", "gering", "mittel", "hoch", "sehr hoch",
"keine Angabe", "NA"), class = "factor"), Zufriedenheit_Wareneingang = c(NA,
"keine Angabe", "mittel", "hoch", "hoch", "mittel", "hoch", NA,
"sehr hoch", "keine Angabe", NA, "keine Angabe", "keine Angabe",
"keine Angabe", "hoch", "hoch", "mittel", "mittel", NA, "mittel",
NA), Intensitaet_Einlagerung = c(NA, "mittel", "gering", "hoch",
"hoch", "gering", "sehr hoch", NA, "sehr hoch", "mittel", NA,
"sehr gering", "sehr gering", "sehr gering", "gering", "sehr hoch",
"sehr hoch", "mittel", NA, "hoch", NA), Zufriedenheit_Einlagerung = structure(c(Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_,
Zufriedenheit_Einlagerung = NA_integer_, Zufriedenheit_Einlagerung = NA_integer_
), .Label = c("sehr gering", "gering", "mittel", "hoch", "sehr hoch",
"keine Angabe", "NA"), class = "factor")), row.names = c(NA,
-21L), class = c("tbl_df", "tbl", "data.frame"))
My goal:
sort values that are valid, exclude missings and those who choose not to reply in this column
plot only the values, choose colour and and the number of displayed cases
additionally I want to include a write comand so that I will have 110 bar charts in my working directory
Where I am stuck
for (i in 1:4) {
test[,c(i)] <- factor((test[ ,c(i)]), levels = c("sehr gering" , "gering", "mittel", "hoch", "sehr hoch", "keine Angabe", "NA"))
print(ggplot(data=subset.data.frame((test[ ,c(i)]) %in% c("sehr gering" , "gering", "mittel", "hoch", "sehr hoch")), aes(x=(test[ ,c(i)])))) +
geom_bar(fill = "cornflowerblue",
color="black") +
geom_text(aes(label=stat(count)), stat = "count", vjust=-.75) +
labs(subtitle = paste("n gesamt: ", nrow(subset(test[ ,c(i)] %in% c("sehr gering" , "gering", "mittel", "hoch", "sehr hoch"))), y = "Häufigkeit")) +
scale_y_continuous(labels = scales::percent) }
This produces the error message:
error in rep_len(TRUE, nrow(x)) : invalid 'length.out' value.
Using ggplot (without print) and naming one variable is working. Any suggestions on how I could finally get this loop running is highly appreciated. I did not found sth. similar in the forum yet.
Thanks a lot
Try using this code -
In for loop I pass column names instead of number.
Used [[]] to subset the respective columns and change them to factor.
Create subset_data as separate dataframe to make it easier to reuse.
Use .data to refer to columns in aes.
library(ggplot2)
for (i in colnames(test)) {
test[[i]] <- factor(test[[i]], levels = c("sehr gering" , "gering", "mittel", "hoch", "sehr hoch", "keine Angabe", "NA"))
subset_data <- subset(test, test[[i]] %in% c("sehr gering" , "gering", "mittel", "hoch", "sehr hoch"))
print(ggplot(data=subset_data, aes(x= .data[[i]])) +
geom_bar(fill = "cornflowerblue",
color="black") +
geom_text(aes(label=stat(count)), stat = "count", vjust=-.75) +
labs(subtitle = paste("n gesamt: ", nrow(subset_data), y = "Häufigkeit")) +
scale_y_continuous(labels = scales::percent))
}
I have this data and I want to make a new column:
structure(list(AGE_GROUP = c("21-30", "31-40", "41-50"), DATE = c("12/17/2020",
"12/17/2020", "12/17/2020"), VACCINE_COUNT = c(36L, 47L, 26L),
PERC_TOTAL_VACC = c(24.82758621, 32.4137931, 17.93103448),
RECIPIENT_COUNT = c(NA_integer_, NA_integer_, NA_integer_
), PERC_TOTAL_RECIP = c(NA_real_, NA_real_, NA_real_), RECIP_FULLY_VACC = c(NA_integer_,
NA_integer_, NA_integer_), PERC_FULLY_VACC = c(NA_real_,
NA_real_, NA_real_)), row.names = c(NA, 3L), class = "data.frame")
based on age group I want to make a column that includes this numbers c(8, 12,13,16,14,12), and repeat this column 3 times. So the outcome is a new column that 3times have the mentioned numbers.
I have used this code vaccine<-vaccine %>% mutate(new_col = rep(list(vals), n())) %>% unnest()
and I have something like this
"12/18/2020", "12/18/2020"), VACCINE_COUNT = c(421L, 421L, 421L
), PERC_TOTAL_VACC = c(15.52932497, 15.52932497, 15.52932497),
RECIPIENT_COUNT = c(NA_integer_, NA_integer_, NA_integer_
), PERC_TOTAL_RECIP = c(NA_real_, NA_real_, NA_real_), RECIP_FULLY_VACC = c(NA_integer_,
NA_integer_, NA_integer_), PERC_FULLY_VACC = c(NA_real_,
NA_real_, NA_real_), X = c(NA, NA, NA), X.1 = c(14L, 14L,
14L), new_col = c(8, 12, 13)), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))```
While I want to keep my data and just repeat the data
Do you mean to repeat the values c(8, 12,13,16,14,12) for each row in the dataframe? Try :
library(dplyr)
library(tidyr)
vals <- c(8, 12,13,16,14,12)
df %>%
mutate(new_col = rep(list(vals), n())) %>%
unnest(new_col)
Using base R
transform(df1[rep(seq_len(nrow(df1)), each = length(vals)),], new_col = vals)
Or with uncount
library(dplyr)
library(tidyr)
df1 %>%
uncount(length(vals)) %>%
mutate(new_col = rep(vals, length.out = n()))
If we need to just replicate and store the column, wrap in a list
df1 %>%
mutate(new_col = list(vals))
data
vals <- c(8, 12,13,16,14,12)
I am trying to create a user-defined function to replace missing values in each variable using specific probabilities.
I can get the probabilities to print, but the second part of the code does not seem to work and all the missing values still remain.
I do not get any particular error message so puzzled why it is not working.
My data:
library(dplyr)
library(glue)
structure(list(id = c("395891", "373742", "316241", "282072",
"341331", "251761", "154591", "125051", "095361", "141822", "281411",
"31571", "165191", "03212", "08091", "26172", "135561", "164331",
"344511", "37352"), ph201_01 = c(1L, NA, 1L, 1L, NA, 1L, 1L,
NA, NA, NA, 1L, NA, NA, 1L, NA, 1L, NA, NA, 1L, NA), ph201_02 = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), ph201_03 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA,
NA, NA, NA), ph201_04 = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -20L))
My code:
# Create user-defined function
create_mock_vars = function(var) {
# get prevalence
prev = round(sum(mydata[[var]], na.rm=TRUE)/nrow(mydata), 2)
print(glue("Prevalence of {var} is {prev}."))
mydata[[var]][is.na(mydata[[var]])] = sample(0:1, size=sum(is.na(mydata[[var]])), replace=TRUE, prob=c(prev, 1-prev))
return(mydata)
}
# Get list of variable names I want to impute
myvarnames = names(mydata[,-1])
# Apply my function
sapply(myvarnames, create_mock_vars)
glimpse(mydata)
You can create the user-defined function as :
create_mock_vars = function(x) {
prev <- sum(x, na.rm=TRUE)/length(x)
x[is.na(x)] <- sample(0:1, size = sum(is.na(x)), replace=TRUE,
prob= c(prev, 1-prev))
return(x)
}
and use lapply to apply it to each myvarnames columns
mydata[myvarnames] <- lapply(mydata[myvarnames], create_mock_vars)
Not entirely sure what you mean so here are two Base R solutions:
df1 <- data.frame(lapply(df,
function(x) {
if (is.numeric(x) & sum(is.na(x)) == length(x)) {
NA_integer_
} else if (is.numeric(x)) {
ifelse(is.na(x), sum(x, na.rm = TRUE) / length(x), x)
} else{
ifelse(is.na(x), na.omit(x)[cumsum(!is.na(x))], x)
}
}))
df2 <- data.frame(lapply(df,
function(x) {
if (is.numeric(x)) {
ifelse(sum(is.na(x)) == length(x), 1,
ifelse(sum(is.na(x)) != length(x),
sum(x, na.rm = TRUE) / length(x), x))
} else{
ifelse(is.na(x), na.omit(x)[cumsum(!is.na(x))], x)
}
}))
Data:
df <- structure(
list(
id = c("395891", "373742", "316241", "282072",
"341331", "251761"),
ph201_01 = c(1L, NA, 1L, 1L, NA, 1L),
ph201_02 = c(
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_
),
ph201_03 = c(
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_
),
ph201_04 = c(
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_
)
),
class = c("tbl_df",
"tbl", "data.frame"),
row.names = c(NA, -6L)
)
I was able to solve this but without creating a user-defined function. Still, it would be great if someone could help me figure out what I did wrong for future reference?
myvarnames = names(mydata[,-1])
for (i in myvarnames) {
prev = round(sum(mydata[[i]], na.rm=TRUE)/nrow(mydata), 2) # prevalence of deficit
mydata[[i]][is.na(mydata[[i]])] = sample(0:1, size=sum(is.na(mydata[[i]])), replace=TRUE, prob=c(prev, 1-prev))
}
glimpse(mydata)
I am trying to run a for loop for the below scenario
I have 100k Unique customers in my transactional table which are going to n number of stores each customer.
I am trying to loop through each customer and seeing in which unique store they are going and if new store has opened within 20 kms range to that store then he/she will go to that store and I will make their Value 1 in new data frame created.
I am initializing my code in first for loop and then repeating the same for rest of my data frame.
My code is extremely slow with for loop. I don't know how to vectorise my code. Below is the snapshot of my code. Please guide me how to make this code faster and efficient.
cust_id = c(unique(kk$Customer_ID))
i = cust_id[1]
# for initializing
s = c(0,0,0,0)
df_temp = kk[kk$Customer_ID == i]
store = c(unique(df_temp$Store_Code))
system.time(
for (j in store){
if(df_temp[Store_Code == j]$dist.km298 < 20) {
s[1] <- 1
}
if(df_temp[Store_Code == j]$dist.km299 < 20) {
s[2] <- 1
}
if(df_temp[Store_Code == j]$dist.km300 < 20) {
s[3] <- 1
}
if(df_temp[Store_Code == j]$dist.km301 < 20) {
s[4] <- 1
}
}
)
vishal <- data.table("Customer_ID" = c(i,i,i,i) , "Store_Code" =
c(60298,60299,60300,60301), "Prediction" = s)
cust_id <- cust_id[!cust_id %in% c(cust_id[1])]
# loop for all customers
count = 1
system.time(for (k in 1:length(cust_id)){
i <- cust_id[k]
# count <- count+1
# if (count == 5) {
# break
#}
s = c(0,0,0,0)
df_temp = kk[kk$Customer_ID == i]
store = c(unique(df_temp$Store_Code))
for (j in store){
#if(df_temp$Store_Code == j & df_temp$Purchase_2016 != 0 &
df_temp$Purchase_2017 == 0){
if(df_temp[Store_Code == j]$dist.km298 < 20) {
s[1] <- 1
}
if(df_temp[Store_Code == j]$dist.km299 < 20) {
s[2] <- 1
}
if(df_temp[Store_Code == j]$dist.km300 < 20) {
s[3] <- 1
}
if(df_temp[Store_Code == j]$dist.km301 < 20) {
s[4] <- 1
}
}
v_temp <- data.table("Customer_ID" = c(i,i,i,i) , "Store_Code" =
c(60298,60299,60300,60301), "Prediction" = s)
l = list(vishal,v_temp)
vishal <- rbindlist(l)
}
)
dput(head(kk, 5))
structure(list(Customer_ID =
structure(c(1800000006365760, 1800000006365820,1800000006366060
,1800000006366060,1800000006366060), class = "integer64"), Store_Code =
c(60067, 60054, 60066,
60069, 60079), Purchase_2016 = c(2L, 1L, 1L, 1L, 2L), Purchase_2017 =
c(2L,
0L, 0L, 0L, 0L), TotalPurchases = c(4L, 1L, 1L, 1L, 2L), Return_2016 =
c(0L,
0L, 0L, 0L, 0L), Return_2017 = c(0L, 0L, 0L, 0L, 0L), Return_2010 = c(0L,
0L, 0L, 0L, 0L), Rp_Ratio_2016 = c(0, 0, 0, 0, 0), Rp_Ratio_2017 = c(0,
0, 0, 0, 0), Sales_Per_Day = c(1699.6, 2101.1, 1331.4, 1813.1,
1193.1), Store_Launch_Date = structure(c(1323820800, 1322006400,
1338163200, 1311984000, 1385164800), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), Store_Size_Sq_Ft = c(8673.5, 12425.5, 15897.1,
6698.1, 3699.5), Customer_Count = c(89351, 118444, 79249, 114246,
54832), Total_Revenue = c(35350868.4, 43702303, 27693164.7, 37712369.7,
24816886.2), dist.km298 = c(140.24, 123.87, 10.2, 131.96, 128.52
), dist.km299 = c(163.37, 140.2, 79.32, 153.01, 145.03), dist.km300 =
c(4.09,
21.05, 126.55, 7.03, 17.41), dist.km301 = c(5.72, 19.04, 125.46,
5.02, 15.4), Nationality = c("INDIA", "UNITED ARAB EMIRATES",
"SRI LANKA", "SRI LANKA", "SRI LANKA"), Gender = c("M", "F",
"M", "M", "M"), Marital_Status = c("Married", "Married", "Married",
"Married", "Married"), Loyalty_Status = c("Gold", "Silver", "Silver",
"Silver", "Silver"), Points = c(814L, 212L, 186L, 186L, 186L),
Age = c(59L, 119L, 59L, 59L, 59L), LastVisit = c(2, 28, 3,
3, 3), Last_rdm_txn_dt1 = structure(c(17601, 16510, 17196,
17196, 17196), class = "Date"), Last_accr_txn_dt1 = structure(c(17801,
17029, 17774, 17774, 17774), class = "Date")), .Names = c("Customer_ID",
"Store_Code", "Purchase_2016", "Purchase_2017", "TotalPurchases",
"Return_2016", "Return_2017", "Return_2010", "Rp_Ratio_2016",
"Rp_Ratio_2017", "Sales_Per_Day", "Store_Launch_Date",
"Store_Size_Sq_Ft",
"Customer_Count", "Total_Revenue", "dist.km298", "dist.km299",
"dist.km300", "dist.km301", "Nationality", "Gender", "Marital_Status",
"Loyalty_Status", "Points", "Age", "LastVisit", "Last_rdm_txn_dt1",
"Last_accr_txn_dt1"), sorted = "Customer_ID", class = c("data.table",
"data.frame"), row.names = c(NA, -5L), .internal.selfref = <pointer:
0x0000000004810788>)
Please read the guide to posting good questions on StackOverflow, it will allow people to answer your questions easily. You question is very confusing but this might give you some ideas:
data %>%
group_by(Customer_ID, Store_Code) %>%
mutate(Predition298 = ifelse(dist.km298 > 20, 1, 0),
Predition299 = ifelse(dist.km299 > 20, 1, 0),
Predition300 = ifelse(dist.km300 > 20, 1, 0))