Cannot make a logistic curve in R

Cannot make a logistic curve in R - r

I am trying to make a logistic curve in R but the line does not appear in the plot.
My data are:
dput(los1)
structure(list(X1 = c("5.51688462301445", "2.55660506920185",
"4.17130300764484", "15.0032350113684", "0.0672790807684578",
"0", "10.7646529229551", "1.6819770192119", "4.44041933071867",
"2.69116323073877", "0", "0.740069888453036", "1.54741885767498",
"0.201837242305373", "1.81653518074882", "6.12239634993057",
"3.49851219996026", "22.4039338958996", "0.538232646147662",
"0.134558161536916", "1.2783025346007", "1.6819770192119", "16.9543283536541",
"60.0129400454734", "9.62090854989083", "0.470953565379205",
"33.7740985457708", "6.8624662383836", "0", "0", "4.50769841148758",
"62.6368241954438", "264.137671097005", "14.5995605267576", "0",
"0", "0", "6.12239634993057", "10.1591411960385", "22.9421665420477",
"0.470953565379205", "2.28748874612802", "13.8594906383046",
"11.0337692460289", "18.6363053728655", "27.2480277112295", "0.0672790807684578",
"0.470953565379205", "0", "0"), X2 = c("No", "No", "Yes", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "Yes", "Yes",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No")), row.names = c(NA, 50L), class = "data.frame")
and the code I use for the curve is:
los1 %>%
mutate(prob = ifelse(X2 == "Yes", 1, 0)) %>%
ggplot(aes(X1, prob)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "lm", se=FALSE, method.args = list(family = "binomial")) +
labs(
title = "Logistic Regression Model",
x = "Plasma Glucose Concentration",
y = "Probability of being diabete-pos"
)
Any idea how to make the curve?

Another option using stat_smooth with "glm" method and "X1" converted to numeric like this:
library(tidyverse)
los1 %>%
mutate(prob = ifelse(X2 == "Yes", 1, 0)) %>%
mutate(X1 = as.numeric(X1)) %>%
ggplot(aes(X1, prob)) +
geom_point(alpha = 0.2) +
stat_smooth(method="glm", color="green", se=FALSE, method.args = list(family=binomial)) +
labs(
title = "Logistic Regression Model",
x = "Plasma Glucose Concentration",
y = "Probability of being diabete-pos"
)
#> `geom_smooth()` using formula 'y ~ x'
Created on 2022-08-30 with reprex v2.0.2

los1 <- structure(list(X1 = c("5.51688462301445", "2.55660506920185",
"4.17130300764484", "15.0032350113684", "0.0672790807684578",
"0", "10.7646529229551", "1.6819770192119", "4.44041933071867",
"2.69116323073877", "0", "0.740069888453036", "1.54741885767498",
"0.201837242305373", "1.81653518074882", "6.12239634993057",
"3.49851219996026", "22.4039338958996", "0.538232646147662",
"0.134558161536916", "1.2783025346007", "1.6819770192119", "16.9543283536541",
"60.0129400454734", "9.62090854989083", "0.470953565379205",
"33.7740985457708", "6.8624662383836", "0", "0", "4.50769841148758",
"62.6368241954438", "264.137671097005", "14.5995605267576", "0",
"0", "0", "6.12239634993057", "10.1591411960385", "22.9421665420477",
"0.470953565379205", "2.28748874612802", "13.8594906383046",
"11.0337692460289", "18.6363053728655", "27.2480277112295", "0.0672790807684578",
"0.470953565379205", "0", "0"), X2 = c("No", "No", "Yes", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "Yes", "Yes",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No")), row.names = c(NA, 50L), class = "data.frame")
str(los1)
los1$X1 <- as.numeric(los1$X1)
los1$Y <- ifelse(los1$X2 == "Yes", 1, 0)
library(ggplot2)
los1 |>
ggplot(aes(X1, Y)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "glm", se=FALSE, method.args = list(family = "binomial")) +
labs(
title = "Logistic Regression Model",
x = "Plasma Glucose Concentration",
y = "Probability of being diabete-pos"
)

Related

Running Fishers discriminate test in R

I have tried the following code to try and run fishers discriminant on my data set but it doesn't like the negative values.
Split_user <- user_col%>%
select(-Heroin)%>%
filter(User == "Yes")
Split_user1 <- Split_user%>%
select(-User)
Split_nonuser <- user_col%>%
select(-Heroin)%>%
filter(User == "No")
Split_nonuser1 <- Split_user%>%
select(-User)
#Calculate the Mean by class
m1 = colMeans(Split_user1)
m1
m2 = colMeans(Split_nonuser1)
m2
#Calculate the covariances by class
S1 = cov(Split_user1)
S2 = cov(Split_nonuser1)
dat <- data.frame(
"User"=c(m1),
"Non User" = c(m2),
stringsAsFactors = FALSE
)
dat
test <- fisher.test(dat)
test
structure(list(Nscore = c(0.31287, -0.67825, -0.46725, -0.14882,
0.73545, -0.67825, -0.46725, -1.32828, 0.62967, -0.24649, -1.05308,
-1.32828, 2.28554, -0.79151, -0.92104, -2.05048, -1.55078, 0.52135,
1.37297, -0.34799, -0.79151, -1.1943, 0.41667, 1.60383, -0.14882
), Escore = c(-0.57545, 1.93886, 0.80523, -0.80615, -1.6334,
-0.30033, -1.09207, 1.93886, 2.57309, 0.00332, 0.80523, 0.00332,
0.16767, 0.80523, 1.45421, -1.50796, -0.80615, -1.23177, -0.15487,
-1.7625, 0.80523, 0.47617, -0.94779, -3.27393, 0.63779), Oscore = c(-0.58331,
1.43533, -0.84732, -0.01928, -0.45174, -1.55521, -0.45174, -0.84732,
-0.97631, -1.42424, -1.11902, 0.14143, 0.44585, -0.01928, 0.44585,
-1.55521, -1.68062, -0.31776, -0.17779, -2.39883, 0.7233, -1.11902,
-0.84732, -1.27553, 1.24033), Ascore = c(-0.91699, 0.76096, -1.6209,
0.59042, -0.30172, 2.03972, -0.30172, -0.30172, 0.76096, 0.59042,
-0.76096, -1.92595, -1.6209, 0.94156, -0.60633, -1.07533, 0.28783,
-0.45321, -1.92595, -1.92595, 1.61108, -0.60633, 1.11406, 0.28783,
0.76096), Cscore = c(-0.00665, -0.14277, -1.0145, 0.58489, 1.30612,
1.63088, 0.93949, 1.63088, 1.13407, 0.12331, 1.81175, -0.52745,
-0.78155, 3.46436, 1.63088, 1.13407, 0.7583, -1.38502, -1.5184,
0.7583, -1.13788, 1.81175, -0.89891, -1.0145, 1.46191), Impulsivity = c(-0.21712,
-0.71126, -1.37983, -1.37983, -0.21712, -1.37983, -0.21712, 0.19268,
-1.37983, -1.37983, 0.19268, 0.52975, 1.29221, -0.71126, 1.29221,
-0.71126, -0.21712, -1.37983, -0.71126, -1.37983, 0.19268, -0.21712,
-0.71126, -1.37983, -0.21712), SS = c(-1.18084, -0.21575, 0.40148,
-1.18084, -0.21575, -1.54858, 0.07987, -0.52593, -1.54858, -0.84637,
0.07987, 1.2247, 0.07987, -0.84637, 0.7654, -0.52593, -2.07848,
-0.84637, -0.21575, -2.07848, -0.21575, -1.18084, 0.07987, -1.54858,
-0.52593), Heroin = c("CL0", "CL0", "CL0", "CL0", "CL0", "CL0",
"CL0", "CL0", "CL0", "CL0", "CL0", "CL0", "CL0", "CL0", "CL0",
"CL0", "CL0", "CL0", "CL0", "CL0", "CL0", "CL0", "CL0", "CL1",
"CL0"), User = c("No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No")), row.names = c(NA, -25L), class = c("tbl_df",
"tbl", "data.frame"))
matrix - non negative enteries
structure(c(0.610545701853111, 0.96447451044118, 0.851040908078699,
0.589544977621028, 0.827711005318391, 0.904054295338232, 0.589544977621028,
0.456502942067452, 0.688702581355738, 0.533249474881585, 0.389562632031922,
0.456502942067452, 0.632276676717644, 0.436140184047234, 0.653498081085907,
0.166581403886779, 0.495545270523754, 0.476097331977154, 0.671587833703474,
0.533249474881585, 0.436140184047234, 0.71082275573317, 0.610545701853111,
0.550877805396231, 0.495545270523754, 0.606851082126989, 0.606851082126989,
0.529890274393643, 0.17257337601325, 0.40854553851544, 0.238467663593076,
0.327676651856099, 0.580185689487682, 0.367919991081809, 0.479144172121096,
1, 0.505317471692705, 0.453951873616486, 0.0885974551303489,
0.259420636058159, 0.505317471692705, 0.63189368241683, 0.386973070246684,
0, 0.327676651856099, 0.479144172121096, 0.327676651856099, 0.386973070246684,
0.350233306260252, 0.63189368241683, 0.535574626985272, 0.563618843854673,
0.750572944212062, 0.223135727679215, 0.4774929083129, 0.706259916342132,
0.355063144061413, 0.275029248201035, 0.667195538247031, 0.563618843854673,
0.850078930076767, 0.223135727679215, 0.796364408545283, 0.163192941968364,
0.223135727679215, 0.355063144061413, 0.163192941968364, 0.595629597576807,
0.4774929083129, 0.223135727679215, 0.535574626985272, 0.706259916342132,
0.508037245380387, 0.383103354327933, 0.414857685465647, 0.403077115158141,
0.233825658261037, 0.558377116170461, 0.712895943634026, 0.210211608610118,
0.285534960468907, 0.658493870402802, 0.429792238205629, 0.533600585795802,
0.210211608610118, 0.482206789967235, 0.684122436721568, 0.285534960468907,
0.456117618078684, 0.533600585795802, 0.508041194672498, 0.18731799331194,
0.456117618078684, 0.357934394919504, 0.712895943634026, 0.558377116170461,
0.558377116170461, 0.156244832950339, 0.581589612247721, 0.658493870402802,
0.318731025010843, 0.202385427208327, 0.202385427208327, 0.249965664305335,
0.406440653462484, 0.599528335983808, 0.498561515107706, 0.571262830706954,
0.571262830706954, 0.788549949400029, 0.694141246205002, 0.694141246205002,
0.729304611825936, 0.474472314587249, 0.429044383403209, 0.694141246205002,
0.36191629319069, 0.474472314587249, 0.154790732976724, 0.474472314587249,
0.429044383403209, 0.297072430244326, 0.474472314587249, 0.154790732976724,
0.406440653462484, 0.446013490788146, 0.757189169998879, 1, 0,
0.528084009118427, 0.446013490788146, 0.271569845659404, 0.624098434171681,
0.271569845659404, 0.528084009118427, 0.757189169998879, 0.528084009118427,
0.446013490788146, 0.367285305878396, 0.156155405657909, 0.271569845659404,
0.624098434171681, 0.757189169998879, 0.528084009118427, 0, 0.367285305878396,
0.528084009118427, 0.528084009118427, 0.156155405657909, 0.528084009118427,
0.825751648038478, 1, 0.619957452233758, 0.308011329405206, 0.619957452233758,
0.53955917314341, 0.710932676034508, 0.46565805295222, 0.53955917314341,
1, 0.825751648038478, 0.619957452233758, 0.825751648038478, 0.710932676034508,
0.619957452233758, 0.53955917314341, 1, 0.710932676034508, 0.710932676034508,
0.825751648038478, 0.825751648038478, 0.619957452233758, 0.46565805295222,
0, 0.825751648038478), dim = c(25L, 7L), dimnames = list(c("30",
"67", "115", "20", "3", "18", "16", "8", "10", "77", "84", "71",
"54", "64", "85", "105", "58", "2", "102", "17", "79", "87",
"63", "11", "39"), c("Nscore", "Escore", "Oscore", "Ascore",
"Cscore", "Impulsivity", "SS")))

Add hatch on geom_segment within the gggenes in ggplot2 [R]

I have a dataframe such as :
> head(tab)
molecule gene start end strand orientation hatch
1 Genome5 genA 405113 407035 forward -1 NO
2 Genome5 genB 407035 407916 forward -1 NO
3 Genome5 genC 407927 408394 forward -1 NO
4 Genome5 genD 408387 408737 reverse -1 NO
5 Genome5 genE 408751 409830 forward 1 NO
6 Genome5 genF 409836 410315 forward -1 NO
And I can produce the following plot using this script :
library(ggplot2)
library(gggenes)
ggplot(example_genes, aes(xmin = start, xmax = end, y = molecule, fill = gene)) +
geom_gene_arrow() +
facet_wrap(~ molecule, scales = "free", ncol = 1) +
scale_fill_brewer(palette = "Set3")
And I would like simply to add hatchs on genes containing hatch=="YES" within the tab$hatch column in the tab
And then get the following result ;
Here is the dput format of the table :
structure(list(molecule = c("Genome5", "Genome5", "Genome5",
"Genome5", "Genome5", "Genome5", "Genome5", "Genome5", "Genome5",
"Genome5", "Genome3", "Genome3", "Genome3", "Genome3", "Genome3",
"Genome3", "Genome3", "Genome3", "Genome4", "Genome4", "Genome4",
"Genome4", "Genome4", "Genome4", "Genome2", "Genome2", "Genome2",
"Genome2", "Genome2", "Genome2", "Genome2", "Genome2", "Genome1",
"Genome1", "Genome1", "Genome1", "Genome1", "Genome1", "Genome1",
"Genome1", "Genome1", "Genome1", "Genome6", "Genome6", "Genome6",
"Genome6", "Genome6", "Genome6", "Genome6", "Genome6", "Genome7",
"Genome7", "Genome7", "Genome7", "Genome7", "Genome7", "Genome7",
"Genome7", "Genome7", "Genome7", "Genome7", "Genome8", "Genome8",
"Genome8", "Genome8", "Genome8", "Genome8", "Genome8", "Genome8",
"Genome8", "Genome8", "Genome8"), gene = c("genA", "genB", "genC",
"genD", "genE", "genF", "protF", "protC", "protD", "protE", "genA",
"genB", "genC", "genD", "genE", "genF", "protA", "protB", "genA",
"genB", "genC", "genD", "genE", "genF", "genA", "genB", "genC",
"genD", "genE", "genF", "protA", "protB", "genA", "genB", "genC",
"genD", "genE", "genF", "protF", "protC", "protD", "protE", "genA",
"genB", "genC", "genD", "genE", "genF", "protA", "protB", "genB",
"genC", "genD", "genE", "genF", "protA", "protB", "protF", "protC",
"protD", "protE", "genB", "genC", "genD", "genE", "genF", "protA",
"protB", "protF", "protC", "protD", "protE"), start = c(405113,
407035, 407927, 408387, 408751, 409836, 410335, 412621, 412830,
413867, -67849, -65867, -64997, -64507, -64127, -63011, -62550,
-62187, -47353, -45431, -44522, -44070, -43701, -42614, 8345,
10327, 11394, 11878, 12258, 13365, 13726, 14260, 15389, 17301,
18176, 18641, 18999, 20086, 20474, 22777, 22986, 24024, 65751,
67698, 68605, 69128, 69501, 70614, 71008, 71375, -9390, -8984,
-8500, -8130, -7019, -6662, -6306, -5695, -3446, -3188, -2116,
2, 413, 898, 1268, 2376, 2733, 3089, 3700, 5949, 6217, 7307),
end = c(407035, 407916, 408394, 408737, 409830, 410315, 412596,
412833, 413870, 414850, -65864, -65013, -64548, -64127, -63048,
-62640, -62209, -61549, -45443, -44571, -44070, -43723, -42625,
-42201, 10330, 11181, 11843, 12255, 13337, 13733, 14067,
14919, 17299, 18161, 18640, 18985, 20078, 20451, 22720, 22989,
24023, 25010, 67691, 68570, 69135, 69511, 70583, 71015, 71349,
72034, -8992, -8511, -8123, -7048, -6663, -6321, -5653, -3449,
-3207, -2136, -1127, 406, 886, 1275, 2350, 2732, 3074, 3742,
5946, 6182, 7269, 8296), strand = c("forward", "forward",
"forward", "reverse", "forward", "forward", "reverse", "forward",
"forward", "forward", "reverse", "reverse", "reverse", "forward",
"reverse", "reverse", "reverse", "reverse", "reverse", "reverse",
"forward", "reverse", "forward", "forward", "forward", "forward",
"forward", "forward", "forward", "reverse", "forward", "reverse",
"reverse", "forward", "reverse", "forward", "reverse", "forward",
"forward", "forward", "forward", "forward", "forward", "forward",
"reverse", "forward", "reverse", "forward", "forward", "forward",
"reverse", "forward", "reverse", "reverse", "forward", "reverse",
"forward", "forward", "reverse", "reverse", "forward", "forward",
"forward", "forward", "forward", "forward", "reverse", "forward",
"forward", "reverse", "reverse", "reverse"), orientation = c(-1,
-1, -1, -1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1,
-1, 1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, 1, -1,
1, -1, 1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1,
1, -1, -1, -1, -1, 1, -1, -1, 1, 1, 1, 1, 1, 1, -1, -1, -1,
-1, -1, 1, 1, -1), hatch = c("NO", "NO", "NO", "NO", "NO",
"NO", "NO", "NO", "NO", "NO", "YES", "NO", "NO", "NO", "NO",
"NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",
"NO", "NO", "NO", "NO", "NO", "NO", "NO", "YES", "NO", "NO",
"NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",
"NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",
"NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",
"NO", "NO", "NO", "NO", "NO", "NO", "NO")), row.names = c(NA,
-72L), class = "data.frame")

A graph with two values as points and mean value with other shape

 Problem:
I can't find the right way to make a plot with values from a given variable with points and plot the value of the mean with another different shape. So far I find a way of doing this, but mean value appears in the color legend also which is something I don't want to. How could I get the desired output? Should I use stat_summary?
NOTE: Variables must be ordered by the mean value among groups by multimorbidity (if it is something important for the solution proposed) this is why I am using reorder_within and scale_x_reordered.
source("https://raw.githubusercontent.com/dgrtwo/drlib/master/R/reorder_within.R")
library(tidyverse)
foo %>%
group_by(multimorbidity, variables) %>%
mutate(Mean = mean(varimportance),
aux_mean = Mean) %>%
ungroup() %>%
spread(Gender, varimportance) %>%
gather(Gender, varimportance, -multimorbidity, -variables, -aux_mean) %>%
mutate(type = if_else(Gender %in% c("Male", "Female"), "Gender", "Mean")) %>%
ggplot(aes(reorder_within(variables, aux_mean, multimorbidity), varimportance,
color = Gender, shape = type)) +
geom_point() +
scale_x_reordered() +
scale_shape_manual(values = c(21, 24)) +
coord_flip() +
facet_wrap(multimorbidity~., scales = "free")
Created on 2019-03-20 by the reprex package (v0.2.1)
The desired output:
dput for foo:
foo <- structure(list(
Gender = c(
"Male", "Male", "Male", "Male", "Male",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Male", "Male", "Male", "Male",
"Male"
), multimorbidity = c(
"Yes", "Yes", "Yes", "Yes", "Yes",
"No", "No", "No", "No", "No", "Yes", "Yes", "Yes", "Yes", "Yes",
"No", "No", "No", "No", "No"
), variables = c(
"bmi", "income",
"soccap", "alternattr", "occhaz", "bmi", "income", "soccap",
"alternattr", "occhaz", "bmi", "income", "soccap", "alternattr",
"occhaz", "bmi", "income", "soccap", "alternattr", "occhaz"
),
varimportance = c(
73.1234145437324, 51.0029811829917, 100,
0, 90.9926659603591, 81.1949541852942, 48.2402164701156,
100, 0, 9.10509052698692, 66.7759248406279, 31.69991730502,
100, 4.7914221037359, 93.4636133674693, 70.8853809607131,
75.004433319282, 100, 0, 43.7326141975936
)
), class = c(
"tbl_df",
"tbl", "data.frame"
), row.names = c(NA, -20L))

to calculate the Entropy

I am new to R and unable to calculate the entropy.
There is a similar question on stackoverflow with the answer but i wanted to know why this code isn't working. Here is the copy paste data from the same question.
One of the answer mentions, "The part I think you are missing is the calculation of the class frequencies and you will get your answer", but how do i fix this. I tried most of the options but still i don't get any output. It just runs without any errors.
info <- function(CLASS.FREQ){
freq.class <- CLASS.FREQ
info <- 0
for(i in 1:length(freq.class)){
if(freq.class[[i]] != 0){ # zero check in class
entropy <- -sum(freq.class[[i]] * log2(freq.class[[i]])) #I calculate the entropy for each class i here
}else{
entropy <- 0
}
info <- info + entropy # sum up entropy from all classes
}
return(info)
}
Dataset as below,
buys <- c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no")
credit <- c("fair", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "excellent")
student <- c("no", "no", "no","no", "yes", "yes", "yes", "no", "yes", "yes", "yes", "no", "yes", "no")
income <- c("high", "high", "high", "medium", "low", "low", "low", "medium", "low", "medium", "medium", "medium", "high", "medium")
age <- c(25, 27, 35, 41, 48, 42, 36, 29, 26, 45, 23, 33, 37, 44)
we change the age from categorical to numeric
Cheers, Jack

You need to calculate the propertion of "no" and "yes" in "buys", the proportion of "fair" and "excellent" in "credit", and so on. Here is one way to do it:
data <- list(
buys = c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no"),
credit = c("fair", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "excellent"),
student = c("no", "no", "no","no", "yes", "yes", "yes", "no", "yes", "yes", "yes", "no", "yes", "no"),
income = c("high", "high", "high", "medium", "low", "low", "low", "medium", "low", "medium", "medium", "medium", "high", "medium"),
age = c(25, 27, 35, 41, 48, 42, 36, 29, 26, 45, 23, 33, 37, 44)
)
freq <- lapply( data, function(x){rowMeans(outer(unique(x),x,"=="))})
.
> freq
$buys
[1] 0.3571429 0.6428571
$credit
[1] 0.5714286 0.4285714
$student
[1] 0.5 0.5
$income
[1] 0.2857143 0.4285714 0.2857143
$age
[1] 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857
[14] 0.07142857
Such a proportion can never be 0. So change if(freq.class[[i]] != 0){ # zero check in class to if(length(freq.class[[i]]) != 0){ # zero check in class:
info <- function(CLASS.FREQ){
freq.class <- CLASS.FREQ
info <- 0
for(i in 1:length(freq.class)){
if(length(freq.class[[i]]) != 0){ # zero check in class
entropy <- -sum(freq.class[[i]] * log2(freq.class[[i]])) #I calculate the entropy for each class i here
}else{
entropy <- 0
}
info <- info + entropy # sum up entropy from all classes
}
return(info)
}
.
> info(freq)
[1] 8.289526
> info(freq$buys)
[1] 0.940286
> info(freq$age)
[1] 3.807355
>

Calculating Entropy

I've tried for several hours to calculate the Entropy and I know I'm missing something. Hopefully someone here can give me an idea!
EDIT: I think my formula is wrong!
CODE:
info <- function(CLASS.FREQ){
freq.class <- CLASS.FREQ
info <- 0
for(i in 1:length(freq.class)){
if(freq.class[[i]] != 0){ # zero check in class
entropy <- -sum(freq.class[[i]] * log2(freq.class[[i]])) #I calculate the entropy for each class i here
}else{
entropy <- 0
}
info <- info + entropy # sum up entropy from all classes
}
return(info)
}
I hope my post is clear, since it's the first time I actually post here.
This is my dataset:
buys <- c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no")
credit <- c("fair", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "excellent")
student <- c("no", "no", "no","no", "yes", "yes", "yes", "no", "yes", "yes", "yes", "no", "yes", "no")
income <- c("high", "high", "high", "medium", "low", "low", "low", "medium", "low", "medium", "medium", "medium", "high", "medium")
age <- c(25, 27, 35, 41, 48, 42, 36, 29, 26, 45, 23, 33, 37, 44) # we change the age from categorical to numeric

Ultimately I find no error in your code as it runs without error. The part I think you are missing is the calculation of the class frequencies and you will get your answer. Quickly running through the different objects you provide I suspect you are looking at buys.
buys <- c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no")
freqs <- table(buys)/length(buys)
info(freqs)
[1] 0.940286
As a matter of improving your code, you can simplify this dramatically as you don't need a loop if you are provided a vector of class frequencies.
For example:
# calculate shannon-entropy
-sum(freqs * log2(freqs))
[1] 0.940286
As a side note, the function entropy.empirical is in the entropy package where you set the units to log2 allowing some more flexibility. Example:
entropy.empirical(freqs, unit="log2")
[1] 0.940286

There is an another way similar to above answer but using a different function.
> buys <- c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no")
> probabilities <- prop.table(table(buys))
> probabilities
buys
no yes
0.3571429 0.6428571
> -sum(probabilities*log2(probabilities))
[1] 0.940286
Also there is a built in function entropy.empirical(probabilities, unit = "log2")

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Cannot make a logistic curve in R - r

Related

Running Fishers discriminate test in R

Add hatch on geom_segment within the gggenes in ggplot2 [R]

A graph with two values as points and mean value with other shape

to calculate the Entropy

Calculating Entropy

Categories

Resources