Editing loop to append output by variable

Editing loop to append output by variable - r

The script below is used to create a loop, create certain outputs and separately export them. I would like to modify the script to append the variables of interest by their stratifiers - i.e. combine health_mental, overall_health and outlook_life outputs when they're stratified by age, and another output when they're stratified by sex. I would like to then export those files out in excel format. Thanks in advance for the help!
library(tidyverse)
library(dplyr)
df <- data.frame (overall_health = c("poor", "good", "excellent", "poor", "good", "poor", "poor", "excellent"),
outlook_life = c("good", "excellent", "excellent", "poor", "excellent", "poor", "excellent", "poor"),
health_mental = c("poor", "poor", "excellent", "poor", "poor", "poor", "excellent", "good"),
sex = c("F", "M", "M", "F", "F", "M", "F", "M"),
age_group = c("50-54", "60-64", "80+", "70-74", "40-44", "45-49", "60-64", "65-69"),
income = c("$<40,000", "$50,000-79,000", "$80,000-110,000", "$111,000+", "$<40,000", "$<40,000", "$50,000-79,000", "$80,000-110,000"),
education = c("HS", "College", "Bachelors", "Masters", "HS", "College", "Bachelors", "Masters"),
geography= c("area1", "area2", "area1", "area2", "area2", "area1", "area2", "area1"))
geos <- unique(df$geography)
vars <- c("health_mental", "overall_health", "outlook_life")
combinations <- expand.grid(c("age_group", "sex"), vars, stringsAsFactors = F)
combinations$label <- paste(combinations$Var1, combinations$Var2, sep = "_")
output <- list()
for (geo in geos){
for (combo in 1:nrow(combinations)){
output_label <- paste(combinations[[combo,"label"]],geo,sep="_")
temp <- df %>%
filter(geography == geo) %>%
group_by_at(combinations[combo,1:2] %>% unlist()%>%unname()) %>%
summarise(count = n(),
total = nrow(.),
proportion = count/nrow(.) *100)
output[[output_label]] <- temp
}
}
Expected output - example

Related

Free axis along particular facets in facet_grid ggplot

I am trying to do 2 things:
First and most important is to somehow get the y axis to vary by the outcome_type2 variable, since they all have fairly different ranges. So the "C" have their own axis range, the "Z" have their own axis range, and the "SS" have their own axis range.
Then the secondary thing would be to somehow adjust the spacing of columns, so that there's a bit of space by those same groups--the 3 "C" columns would be close together, with a bit of extra white space between them and the "Z", then same between "Z" and "SS". Just to differentiate a little more between those three groups.
I tried tinkering with faceting on outcome_type2 instead of outcome_type but to no avail.
This is current base code, which technically works fine, but as you'll see, having them all use the same Y axis really swamps the "Z" and "SS" panels.
ggplot(dtest, aes(x = var2, y = avg2, fill = var2)) +
geom_bar(stat = "identity",
width = 1) +
facet_grid(wave ~ forcats::fct_relevel(outcome_type, "CT", "CI", "CE", "FZ", "MZ", "PSS", "CSS"),
scales = "free_y",
space = "free_y") +
theme_minimal() +
theme(legend.position = "none")
dtest <- structure(list(outcome_type = c("CT", "CT", "CT", "CI", "CI",
"CI", "CE", "CE", "CE", "FZ", "FZ", "MZ", "MZ", "PSS", "PSS",
"CSS", "CSS", "CT", "CT", "CT", "CI", "CI", "CI", "CE", "CE",
"CE", "FZ", "FZ", "MZ", "MZ", "PSS", "PSS", "CSS", "CSS"), wave = c("Wave 1",
"Wave 2", "Wave 3", "Wave 1", "Wave 2", "Wave 3", "Wave 1", "Wave 2",
"Wave 3", "Wave 2", "Wave 3", "Wave 2", "Wave 3", "Wave 1", "Wave 3",
"Wave 1", "Wave 3", "Wave 1", "Wave 2", "Wave 3", "Wave 1", "Wave 2",
"Wave 3", "Wave 1", "Wave 2", "Wave 3", "Wave 2", "Wave 3", "Wave 2",
"Wave 3", "Wave 1", "Wave 3", "Wave 1", "Wave 3"), var2 = c("Skipped",
"Skipped", "Skipped", "Skipped", "Skipped", "Skipped", "Skipped",
"Skipped", "Skipped", "Skipped", "Skipped", "Skipped", "Skipped",
"Skipped", "Skipped", "Skipped", "Skipped", "Attended", "Attended",
"Attended", "Attended", "Attended", "Attended", "Attended", "Attended",
"Attended", "Attended", "Attended", "Attended", "Attended", "Attended",
"Attended", "Attended", "Attended"), avg2 = c(30.21, 20.88, 25.43,
7.68, 8.26, 7.89, 11.15, 8, 5.99, 1.64, 0.43, 0.6, 0.77, 0.01,
-0.09, -0.2, -0.01, 24.01, 19.98, 29.04, 9.82, 12.41, 12.99,
14.35, 11.01, 10, 2.36, 2.3, 1.51, 0.91, -0.23, -0.35, -0.17,
-0.14), outcome_type2 = c("C", "C", "C", "C", "C", "C", "C",
"C", "C", "Z", "Z", "Z", "Z", "SS", "SS", "SS", "SS", "C", "C",
"C", "C", "C", "C", "C", "C", "C", "Z", "Z", "Z", "Z", "SS",
"SS", "SS", "SS")), class = c("spec_tbl_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -34L), spec = structure(list(
cols = list(outcome_type = structure(list(), class = c("collector_character",
"collector")), wave = structure(list(), class = c("collector_character",
"collector")), var2 = structure(list(), class = c("collector_character",
"collector")), avg2 = structure(list(), class = c("collector_double",
"collector")), outcome_type2 = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))

One option would be to create separate plots for each group of panels and glue them together using patchwork. Doing so you get "free" scale for each group of panels automatically and also have one (and only one) axis for each panel group.
To this end first add a group column to your data which could be used to split your dataset by facet panel group. Additionally, for convenience I use a plotting function which also removes the y axis strip texts for the first two groups of panels and as an important step completes each dataset so that all combinations of wave, outcome_type and var2 are present in each sub-dataset.
library(ggplot2)
library(patchwork)
library(magrittr)
dtest$group <- dplyr::case_when(
grepl("SS$", dtest$outcome_type) ~ "SS",
grepl("Z$", dtest$outcome_type) ~ "Z",
TRUE ~ "C"
)
dtest$group <- factor(dtest$group, c("C", "Z", "SS"))
plot_fun <- function(.data) {
remove_facet <- if (unique(.data$group) %in% c("C", "Z")) {
theme(strip.text.y = element_blank())
}
.data$outcome_type <- forcats::fct_relevel(
.data$outcome_type,
"CT", "CI", "CE", "FZ", "MZ", "PSS", "CSS"
)
.data |>
tidyr::complete(outcome_type, wave = unique(dtest$wave), var2) %>%
ggplot(aes(x = var2, y = avg2, fill = var2)) +
geom_bar(
stat = "identity",
width = 1
) +
facet_grid(wave ~ outcome_type) +
theme_minimal() +
remove_facet
}
dtest_split <- split(dtest, dtest$group)
lapply(dtest_split, plot_fun) %>%
wrap_plots() +
plot_layout(widths = c(3, 2, 2), guides = "collect") &
labs(x = NULL, y = NULL, fill = NULL) &
theme(axis.text.x = element_blank())
#> Warning: 4 unknown levels in `f`: FZ, MZ, PSS, and CSS
#> Warning: 5 unknown levels in `f`: CT, CI, CE, PSS, and CSS
#> Warning: 5 unknown levels in `f`: CT, CI, CE, FZ, and MZ
#> Warning: Removed 4 rows containing missing values (`position_stack()`).
#> Removed 4 rows containing missing values (`position_stack()`).

Here is a solution where we first identify those avg2 < 5, then make a list of two data frames and plot for each data frame the corresponding plot:
library(tidyverse)
require(gridExtra)
my_list <- dtest %>%
pivot_longer(c(contains("type"))) %>%
mutate(value = fct_relevel(value, "CT", "CI", "CE", "FZ", "MZ", "PSS", "CSS")) %>%
arrange(value) %>%
mutate(x = ifelse(avg2 <5, 1, 0)) %>%
group_split(x)
plot1 <- ggplot(my_list[[1]], aes(x = var2, y = avg2, fill = var2))+
geom_col()+
facet_grid(wave ~ value) +
theme_minimal() +
theme(legend.position = "none",
strip.text.y = element_blank()
)
plot2 <- ggplot(my_list[[2]], aes(x = var2, y = avg2, fill = var2))+
geom_col()+
facet_grid(wave ~ value)+
theme_minimal() +
theme(legend.position = "none")+
labs(y="")
grid.arrange(plot1, plot2, ncol=2)

How to invoke functions on subsets of random samples of data

I'm trying to perform a t.test for a specific subset of data. Say I have a data set of 116 birds, and want to find a random sample of 35 birds (non-unique) of the "Species" category. I then want to find the mean of the "Body.Mass" of these random species. Then, I want to invoke a t.test on this sample as representative of the whole data.
I first stored the data in object "bird." I tried taking the random sample using sample(bird$Species, 35), which yielded 35 random species of bird. Now I can't seem to further subset this random sample to find the means of the Body.Mass of every random sample species. I tried to subset using tidyverse, but that's the only way I'm aware of to solve a problem like this.
library(dplyr)
bird = read.csv("NZBIRDS.csv")
dput(head(bird))
set.seed(20)
sambird = sample(bird$Species,35)
sambird
bmbird <- sambird %>% summarize(avg = mean(Body.Mass))
bmbird
structure(list(Species = c("Grebes", "Grebes", "Petrels", "Petrels",
"Petrels", "Petrels"), Name = c("P. cristatus", "P. rufopectus",
"P. gavia", "P. assimilis", "P. urinatrix", "P. georgicus"),
Extinct = c("No", "No", "Yes", "Yes", "Yes", "No"), Habitat = c("A",
"A", "A", "A", "A", "A"), Nest.Site = c("G", "G", "GC", "GC",
"GC", "GC"), Nest.Density = c("L", "L", "H", "H", "H", "H"
), Diet = c("F", "F", "F", "F", "F", "F"), Flight = c("Yes",
"Yes", "Yes", "Yes", "Yes", "Yes"), Body.Mass = c(1100L,
250L, 300L, 200L, 130L, 120L), Egg.Length = c(57, 43, 57,
54, 38, 39)), .Names = c("Species", "Name", "Extinct", "Habitat",
"Nest.Site", "Nest.Density", "Diet", "Flight", "Body.Mass", "Egg.Length"
), row.names = c(NA, 6L), class = "data.frame")
Error in UseMethod("summarise_") : no applicable method for 'summarise_' applied to an object of class "factor"

It's a bit unclear whether you want to sample from a list of the unique species in the data, or sample rows so that each "Species" type can appear multiple times in the data. If you want to sample from the unique species, you can do:
# Only sampling one species since the example data
# contains only two, should work fine
# for more random species
random_species = sample(unique(bird$Species), 1, replace = FALSE)
bird %>%
filter(Species %in% random_species) %>%
group_by(Species) %>%
summarize(avg = mean(Body.Mass))

How to use the predict function with categorical interaction terms in R

I am trying to use predict() to find prediction values for a linear regression pertaining to kick offs in football. My model is:
fit8 <- lm(endyl ~ gdate + kt + rt+ ylr + gdate*rt + kt*ylr + gdate*kt
+ gdate*kt*ylr, data = returned_kicks)
endyl and ylr are numerical, but gdate, kt, and rt, are all categorical with 4, 32, and 32 levels respectively. As you can see there are 4 interaction terms and they use the categorical predictors. My attempt at predicting looks like this:
newdata1 <- with(returned_kicks, data.frame(gdate = factor('Nov', levels = c('Sept', 'Oct', 'Nov', 'Dec')),
kt = factor("ATL", levels =c("ARI", "ATL", "BAL", "BUF", "CAR", "CHI", "CIN", "CLE", "DAL", "DEN", "DET", "GB", "HOU", "IND", "JAX",
"KC", "LA", "MIA","MIN", "NE", "NO", "NYG", "NYJ", "OAK", "PHI","PIT", "SD", "SEA", "SF", "TB", "TEN", "WAS")),
rt = factor("BUF", levels =c("ARI", "ATL", "BAL", "BUF", "CAR", "CHI", "CIN", "CLE", "DAL", "DEN", "DET", "GB", "HOU", "IND", "JAX",
"KC", "LA", "MIA","MIN", "NE", "NO", "NYG", "NYJ", "OAK", "PHI","PIT", "SD", "SEA", "SF", "TB", "TEN", "WAS")), ylr = -5))
newdata1
predict.lm(fit8, newdata = newdata1, type = "response")
I have seen some places say including the levels like that helps with categorical predictors, but it doesn't seem to be accounting for the interaction terms. Am I doing something wrong in the predict function or am I setting up the data.frame wrong? Any suggestions on how to fix the error are appreciated, thank you.

Error in inference function in frbs package - R

I am trying to implement a fuzzy inference system in R using frbs package.
Here is my code -
varinp.mf <- matrix(c(1,1,1,1,3,1,1,4,3,1,1,1,1,3,1,4,4,3,
0,20,40,70,85,0,5,30,65,0,25,40,70,85,0,20,45,70,
15,35,65,85,95,5,25,45,90,20,30,50,75,95,15,30,50,90,
30,45,75,90,0,10,35,65,90,30,45,75,90,100,25,40,65,100,
0,0,0,0,0,0,0,75,100,0,0,0,0,0,0,50,75,0
), nrow=5, byrow=TRUE)
## Define number of linguistic terms of input variables.
num.fvalinput <- matrix(c(5, 4, 5, 4), nrow=1)
varinput.1 <- c("veryPoor", "Poor", "Average","Good","veryGood")
varinput.2 <- c("Less", "Average", "Many", "aLot")
varinput.3 <- c("veryPoor", "Poor", "Average","Good","veryGood")
varinput.4 <- c("Less", "Average", "More","High")
names.varinput <- c(varinput.1, varinput.2, varinput.3, varinput.4)
## Set interval of data.
range.data <- matrix(c(0, 100, 0, 100, 0, 100, 0, 100, 0, 100), nrow = 2)
## Define inference parameters.
## Detailed information about values can be seen in the inference function.
type.defuz <- "WAM"
type.tnorm <- "MIN"
type.snorm <- "MAX"
type.implication.func <- "ZADEH"
## Give the name of simulation.
name <- "Sim-0"
## the names of variables
colnames.var <- c("Hotel_Facility_Score", "Visited_Count", "Room_facility_score", "Average_price", "Relative_Class")
## Define number of linguistic terms of output variable.
## In this case, we set the number of linguistic terms to 3.
num.fvaloutput <- matrix(c(5), nrow = 1)
## Give the names of the linguistic terms of the output variable.
varoutput.1 <- c("veryGood", "Good", "Average","Poor","veryPoor")
names.varoutput <- c(varoutput.1)
## Define the shapes and parameters of the membership functions of the output variables.
varout.mf <- matrix(c(5,5,5,5,5,
95,75,55,35,20,
4,8,5,7,5,
0,0,0,0,0,
0,0,0,0,0),
nrow = 5, byrow = TRUE)
## Define the fuzzy IF-THEN rules;
rule <- matrix(
c("veryGood", "and", "aLot", "and", "veryGood", "and", "less", "->", "veryGood",
"veryGood", "and", "Many", "and", "veryGood", "and", "Average", "->", "veryGood",
"veryGood", "and", "aLot", "and", "veryGood", "and", "Average", "->", "Good",
"veryGood", "and", "aLot", "and", "veryGood", "and", "Less", "->", "Good",
"Good", "and", "Many", "and", "Good", "and", "Less", "->", "Good",
"Good", "and", "aLot", "and", "Good", "and", "Average", "->", "Good",
"Average", "and", "aLot", "and", "Good", "and", "Less", "->", "Average",
"veryGood", "and", "Average", "and", "veryGood", "and", "More", "->", "Average",
"Good", "and", "Many", "and", "Good", "and", "Average", "->", "Average",
"Average", "and", "Average", "and", "Average", "and", "More", "->", "Poor",
"Good", "and", "Many", "and", "vGood", "and", "High", "->", "Poor",
"Average", "and", "Average", "and", "Average", "and", "High", "->", "Poor",
"Poor", "and", "Less", "and", "Poor","and", "High", "->", "veryPoor",
"veryPoor", "and", "Less", "and", "veryPoor", "and", "High", "->", "veryPoor"),
nrow = 14, byrow = TRUE)
## Set type of model which is "MAMDANI".
type.model <- "MAMDANI"
## Generate a fuzzy model with frbs.gen.
object <- frbs.gen(range.data, num.fvalinput, names.varinput,
num.fvaloutput, varout.mf, names.varoutput, rule,
varinp.mf, type.model, type.defuz, type.tnorm,
type.snorm, func.tsk = NULL, colnames.var, type.implication.func, name)
## Plot the membership function.
plotMF(object)
newdata <- matrix(c(15, 80, 85, 85, 45, 75, 78, 70), nrow = 2, byrow = TRUE)
## Fuzzification Module:
num.varinput <- ncol(num.fvalinput)
MF <- fuzzifier(newdata, num.varinput, num.fvalinput, varinp.mf)
## Check input data given by user.
ruleb <- rulebase(type.model, rule, func.tsk = NULL)
## Inference Module:
miu.rule <- inference(MF, ruleb, names.varinput, type.tnorm, type.snorm)
I am getting this error -
miu.rule <- inference(MF, ruleb, names.varinput, type.tnorm, type.snorm)
Error in MF[k, temp[j + 2]] : subscript out of bounds
I took help from the example give at this link -
http://www.inside-r.org/packages/cran/frbs/docs/frbs.gen
But the example runs fine. I am not able to find what is the error in my code.

I found a way to block the error: decrease your rule list down to 3 elements as in the example :-(.
But still don't know why!
BTW creating some additional rules (up to 5) in the example doesn't trigger error.
Edit 16/07/19: I don't retest the whole stuff but this is surely linked to the data/structure format.
Look the data/structure format of the example and stick thoroughly to this format for every data/structure you create and send to the package. This should solve your current error.

to calculate the Entropy

I am new to R and unable to calculate the entropy.
There is a similar question on stackoverflow with the answer but i wanted to know why this code isn't working. Here is the copy paste data from the same question.
One of the answer mentions, "The part I think you are missing is the calculation of the class frequencies and you will get your answer", but how do i fix this. I tried most of the options but still i don't get any output. It just runs without any errors.
info <- function(CLASS.FREQ){
freq.class <- CLASS.FREQ
info <- 0
for(i in 1:length(freq.class)){
if(freq.class[[i]] != 0){ # zero check in class
entropy <- -sum(freq.class[[i]] * log2(freq.class[[i]])) #I calculate the entropy for each class i here
}else{
entropy <- 0
}
info <- info + entropy # sum up entropy from all classes
}
return(info)
}
Dataset as below,
buys <- c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no")
credit <- c("fair", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "excellent")
student <- c("no", "no", "no","no", "yes", "yes", "yes", "no", "yes", "yes", "yes", "no", "yes", "no")
income <- c("high", "high", "high", "medium", "low", "low", "low", "medium", "low", "medium", "medium", "medium", "high", "medium")
age <- c(25, 27, 35, 41, 48, 42, 36, 29, 26, 45, 23, 33, 37, 44)
we change the age from categorical to numeric
Cheers, Jack

You need to calculate the propertion of "no" and "yes" in "buys", the proportion of "fair" and "excellent" in "credit", and so on. Here is one way to do it:
data <- list(
buys = c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no"),
credit = c("fair", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "excellent"),
student = c("no", "no", "no","no", "yes", "yes", "yes", "no", "yes", "yes", "yes", "no", "yes", "no"),
income = c("high", "high", "high", "medium", "low", "low", "low", "medium", "low", "medium", "medium", "medium", "high", "medium"),
age = c(25, 27, 35, 41, 48, 42, 36, 29, 26, 45, 23, 33, 37, 44)
)
freq <- lapply( data, function(x){rowMeans(outer(unique(x),x,"=="))})
.
> freq
$buys
[1] 0.3571429 0.6428571
$credit
[1] 0.5714286 0.4285714
$student
[1] 0.5 0.5
$income
[1] 0.2857143 0.4285714 0.2857143
$age
[1] 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857
[14] 0.07142857
Such a proportion can never be 0. So change if(freq.class[[i]] != 0){ # zero check in class to if(length(freq.class[[i]]) != 0){ # zero check in class:
info <- function(CLASS.FREQ){
freq.class <- CLASS.FREQ
info <- 0
for(i in 1:length(freq.class)){
if(length(freq.class[[i]]) != 0){ # zero check in class
entropy <- -sum(freq.class[[i]] * log2(freq.class[[i]])) #I calculate the entropy for each class i here
}else{
entropy <- 0
}
info <- info + entropy # sum up entropy from all classes
}
return(info)
}
.
> info(freq)
[1] 8.289526
> info(freq$buys)
[1] 0.940286
> info(freq$age)
[1] 3.807355
>

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Editing loop to append output by variable - r

Related

Free axis along particular facets in facet_grid ggplot

How to invoke functions on subsets of random samples of data

How to use the predict function with categorical interaction terms in R

Error in inference function in frbs package - R

to calculate the Entropy

Categories

Resources