I am trying to make a logistic curve in R but the line does not appear in the plot.
My data are:
dput(los1)
structure(list(X1 = c("5.51688462301445", "2.55660506920185",
"4.17130300764484", "15.0032350113684", "0.0672790807684578",
"0", "10.7646529229551", "1.6819770192119", "4.44041933071867",
"2.69116323073877", "0", "0.740069888453036", "1.54741885767498",
"0.201837242305373", "1.81653518074882", "6.12239634993057",
"3.49851219996026", "22.4039338958996", "0.538232646147662",
"0.134558161536916", "1.2783025346007", "1.6819770192119", "16.9543283536541",
"60.0129400454734", "9.62090854989083", "0.470953565379205",
"33.7740985457708", "6.8624662383836", "0", "0", "4.50769841148758",
"62.6368241954438", "264.137671097005", "14.5995605267576", "0",
"0", "0", "6.12239634993057", "10.1591411960385", "22.9421665420477",
"0.470953565379205", "2.28748874612802", "13.8594906383046",
"11.0337692460289", "18.6363053728655", "27.2480277112295", "0.0672790807684578",
"0.470953565379205", "0", "0"), X2 = c("No", "No", "Yes", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "Yes", "Yes",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No")), row.names = c(NA, 50L), class = "data.frame")
and the code I use for the curve is:
los1 %>%
mutate(prob = ifelse(X2 == "Yes", 1, 0)) %>%
ggplot(aes(X1, prob)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "lm", se=FALSE, method.args = list(family = "binomial")) +
labs(
title = "Logistic Regression Model",
x = "Plasma Glucose Concentration",
y = "Probability of being diabete-pos"
)
Any idea how to make the curve?
Another option using stat_smooth with "glm" method and "X1" converted to numeric like this:
library(tidyverse)
los1 %>%
mutate(prob = ifelse(X2 == "Yes", 1, 0)) %>%
mutate(X1 = as.numeric(X1)) %>%
ggplot(aes(X1, prob)) +
geom_point(alpha = 0.2) +
stat_smooth(method="glm", color="green", se=FALSE, method.args = list(family=binomial)) +
labs(
title = "Logistic Regression Model",
x = "Plasma Glucose Concentration",
y = "Probability of being diabete-pos"
)
#> `geom_smooth()` using formula 'y ~ x'
Created on 2022-08-30 with reprex v2.0.2
los1 <- structure(list(X1 = c("5.51688462301445", "2.55660506920185",
"4.17130300764484", "15.0032350113684", "0.0672790807684578",
"0", "10.7646529229551", "1.6819770192119", "4.44041933071867",
"2.69116323073877", "0", "0.740069888453036", "1.54741885767498",
"0.201837242305373", "1.81653518074882", "6.12239634993057",
"3.49851219996026", "22.4039338958996", "0.538232646147662",
"0.134558161536916", "1.2783025346007", "1.6819770192119", "16.9543283536541",
"60.0129400454734", "9.62090854989083", "0.470953565379205",
"33.7740985457708", "6.8624662383836", "0", "0", "4.50769841148758",
"62.6368241954438", "264.137671097005", "14.5995605267576", "0",
"0", "0", "6.12239634993057", "10.1591411960385", "22.9421665420477",
"0.470953565379205", "2.28748874612802", "13.8594906383046",
"11.0337692460289", "18.6363053728655", "27.2480277112295", "0.0672790807684578",
"0.470953565379205", "0", "0"), X2 = c("No", "No", "Yes", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "Yes", "Yes",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No")), row.names = c(NA, 50L), class = "data.frame")
str(los1)
los1$X1 <- as.numeric(los1$X1)
los1$Y <- ifelse(los1$X2 == "Yes", 1, 0)
library(ggplot2)
los1 |>
ggplot(aes(X1, Y)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "glm", se=FALSE, method.args = list(family = "binomial")) +
labs(
title = "Logistic Regression Model",
x = "Plasma Glucose Concentration",
y = "Probability of being diabete-pos"
)
Related
I have tried the following code to try and run fishers discriminant on my data set but it doesn't like the negative values.
Split_user <- user_col%>%
select(-Heroin)%>%
filter(User == "Yes")
Split_user1 <- Split_user%>%
select(-User)
Split_nonuser <- user_col%>%
select(-Heroin)%>%
filter(User == "No")
Split_nonuser1 <- Split_user%>%
select(-User)
#Calculate the Mean by class
m1 = colMeans(Split_user1)
m1
m2 = colMeans(Split_nonuser1)
m2
#Calculate the covariances by class
S1 = cov(Split_user1)
S2 = cov(Split_nonuser1)
dat <- data.frame(
"User"=c(m1),
"Non User" = c(m2),
stringsAsFactors = FALSE
)
dat
test <- fisher.test(dat)
test
structure(list(Nscore = c(0.31287, -0.67825, -0.46725, -0.14882,
0.73545, -0.67825, -0.46725, -1.32828, 0.62967, -0.24649, -1.05308,
-1.32828, 2.28554, -0.79151, -0.92104, -2.05048, -1.55078, 0.52135,
1.37297, -0.34799, -0.79151, -1.1943, 0.41667, 1.60383, -0.14882
), Escore = c(-0.57545, 1.93886, 0.80523, -0.80615, -1.6334,
-0.30033, -1.09207, 1.93886, 2.57309, 0.00332, 0.80523, 0.00332,
0.16767, 0.80523, 1.45421, -1.50796, -0.80615, -1.23177, -0.15487,
-1.7625, 0.80523, 0.47617, -0.94779, -3.27393, 0.63779), Oscore = c(-0.58331,
1.43533, -0.84732, -0.01928, -0.45174, -1.55521, -0.45174, -0.84732,
-0.97631, -1.42424, -1.11902, 0.14143, 0.44585, -0.01928, 0.44585,
-1.55521, -1.68062, -0.31776, -0.17779, -2.39883, 0.7233, -1.11902,
-0.84732, -1.27553, 1.24033), Ascore = c(-0.91699, 0.76096, -1.6209,
0.59042, -0.30172, 2.03972, -0.30172, -0.30172, 0.76096, 0.59042,
-0.76096, -1.92595, -1.6209, 0.94156, -0.60633, -1.07533, 0.28783,
-0.45321, -1.92595, -1.92595, 1.61108, -0.60633, 1.11406, 0.28783,
0.76096), Cscore = c(-0.00665, -0.14277, -1.0145, 0.58489, 1.30612,
1.63088, 0.93949, 1.63088, 1.13407, 0.12331, 1.81175, -0.52745,
-0.78155, 3.46436, 1.63088, 1.13407, 0.7583, -1.38502, -1.5184,
0.7583, -1.13788, 1.81175, -0.89891, -1.0145, 1.46191), Impulsivity = c(-0.21712,
-0.71126, -1.37983, -1.37983, -0.21712, -1.37983, -0.21712, 0.19268,
-1.37983, -1.37983, 0.19268, 0.52975, 1.29221, -0.71126, 1.29221,
-0.71126, -0.21712, -1.37983, -0.71126, -1.37983, 0.19268, -0.21712,
-0.71126, -1.37983, -0.21712), SS = c(-1.18084, -0.21575, 0.40148,
-1.18084, -0.21575, -1.54858, 0.07987, -0.52593, -1.54858, -0.84637,
0.07987, 1.2247, 0.07987, -0.84637, 0.7654, -0.52593, -2.07848,
-0.84637, -0.21575, -2.07848, -0.21575, -1.18084, 0.07987, -1.54858,
-0.52593), Heroin = c("CL0", "CL0", "CL0", "CL0", "CL0", "CL0",
"CL0", "CL0", "CL0", "CL0", "CL0", "CL0", "CL0", "CL0", "CL0",
"CL0", "CL0", "CL0", "CL0", "CL0", "CL0", "CL0", "CL0", "CL1",
"CL0"), User = c("No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No")), row.names = c(NA, -25L), class = c("tbl_df",
"tbl", "data.frame"))
matrix - non negative enteries
structure(c(0.610545701853111, 0.96447451044118, 0.851040908078699,
0.589544977621028, 0.827711005318391, 0.904054295338232, 0.589544977621028,
0.456502942067452, 0.688702581355738, 0.533249474881585, 0.389562632031922,
0.456502942067452, 0.632276676717644, 0.436140184047234, 0.653498081085907,
0.166581403886779, 0.495545270523754, 0.476097331977154, 0.671587833703474,
0.533249474881585, 0.436140184047234, 0.71082275573317, 0.610545701853111,
0.550877805396231, 0.495545270523754, 0.606851082126989, 0.606851082126989,
0.529890274393643, 0.17257337601325, 0.40854553851544, 0.238467663593076,
0.327676651856099, 0.580185689487682, 0.367919991081809, 0.479144172121096,
1, 0.505317471692705, 0.453951873616486, 0.0885974551303489,
0.259420636058159, 0.505317471692705, 0.63189368241683, 0.386973070246684,
0, 0.327676651856099, 0.479144172121096, 0.327676651856099, 0.386973070246684,
0.350233306260252, 0.63189368241683, 0.535574626985272, 0.563618843854673,
0.750572944212062, 0.223135727679215, 0.4774929083129, 0.706259916342132,
0.355063144061413, 0.275029248201035, 0.667195538247031, 0.563618843854673,
0.850078930076767, 0.223135727679215, 0.796364408545283, 0.163192941968364,
0.223135727679215, 0.355063144061413, 0.163192941968364, 0.595629597576807,
0.4774929083129, 0.223135727679215, 0.535574626985272, 0.706259916342132,
0.508037245380387, 0.383103354327933, 0.414857685465647, 0.403077115158141,
0.233825658261037, 0.558377116170461, 0.712895943634026, 0.210211608610118,
0.285534960468907, 0.658493870402802, 0.429792238205629, 0.533600585795802,
0.210211608610118, 0.482206789967235, 0.684122436721568, 0.285534960468907,
0.456117618078684, 0.533600585795802, 0.508041194672498, 0.18731799331194,
0.456117618078684, 0.357934394919504, 0.712895943634026, 0.558377116170461,
0.558377116170461, 0.156244832950339, 0.581589612247721, 0.658493870402802,
0.318731025010843, 0.202385427208327, 0.202385427208327, 0.249965664305335,
0.406440653462484, 0.599528335983808, 0.498561515107706, 0.571262830706954,
0.571262830706954, 0.788549949400029, 0.694141246205002, 0.694141246205002,
0.729304611825936, 0.474472314587249, 0.429044383403209, 0.694141246205002,
0.36191629319069, 0.474472314587249, 0.154790732976724, 0.474472314587249,
0.429044383403209, 0.297072430244326, 0.474472314587249, 0.154790732976724,
0.406440653462484, 0.446013490788146, 0.757189169998879, 1, 0,
0.528084009118427, 0.446013490788146, 0.271569845659404, 0.624098434171681,
0.271569845659404, 0.528084009118427, 0.757189169998879, 0.528084009118427,
0.446013490788146, 0.367285305878396, 0.156155405657909, 0.271569845659404,
0.624098434171681, 0.757189169998879, 0.528084009118427, 0, 0.367285305878396,
0.528084009118427, 0.528084009118427, 0.156155405657909, 0.528084009118427,
0.825751648038478, 1, 0.619957452233758, 0.308011329405206, 0.619957452233758,
0.53955917314341, 0.710932676034508, 0.46565805295222, 0.53955917314341,
1, 0.825751648038478, 0.619957452233758, 0.825751648038478, 0.710932676034508,
0.619957452233758, 0.53955917314341, 1, 0.710932676034508, 0.710932676034508,
0.825751648038478, 0.825751648038478, 0.619957452233758, 0.46565805295222,
0, 0.825751648038478), dim = c(25L, 7L), dimnames = list(c("30",
"67", "115", "20", "3", "18", "16", "8", "10", "77", "84", "71",
"54", "64", "85", "105", "58", "2", "102", "17", "79", "87",
"63", "11", "39"), c("Nscore", "Escore", "Oscore", "Ascore",
"Cscore", "Impulsivity", "SS")))
I have a dataframe such as :
> head(tab)
molecule gene start end strand orientation hatch
1 Genome5 genA 405113 407035 forward -1 NO
2 Genome5 genB 407035 407916 forward -1 NO
3 Genome5 genC 407927 408394 forward -1 NO
4 Genome5 genD 408387 408737 reverse -1 NO
5 Genome5 genE 408751 409830 forward 1 NO
6 Genome5 genF 409836 410315 forward -1 NO
And I can produce the following plot using this script :
library(ggplot2)
library(gggenes)
ggplot(example_genes, aes(xmin = start, xmax = end, y = molecule, fill = gene)) +
geom_gene_arrow() +
facet_wrap(~ molecule, scales = "free", ncol = 1) +
scale_fill_brewer(palette = "Set3")
And I would like simply to add hatchs on genes containing hatch=="YES" within the tab$hatch column in the tab
And then get the following result ;
Here is the dput format of the table :
structure(list(molecule = c("Genome5", "Genome5", "Genome5",
"Genome5", "Genome5", "Genome5", "Genome5", "Genome5", "Genome5",
"Genome5", "Genome3", "Genome3", "Genome3", "Genome3", "Genome3",
"Genome3", "Genome3", "Genome3", "Genome4", "Genome4", "Genome4",
"Genome4", "Genome4", "Genome4", "Genome2", "Genome2", "Genome2",
"Genome2", "Genome2", "Genome2", "Genome2", "Genome2", "Genome1",
"Genome1", "Genome1", "Genome1", "Genome1", "Genome1", "Genome1",
"Genome1", "Genome1", "Genome1", "Genome6", "Genome6", "Genome6",
"Genome6", "Genome6", "Genome6", "Genome6", "Genome6", "Genome7",
"Genome7", "Genome7", "Genome7", "Genome7", "Genome7", "Genome7",
"Genome7", "Genome7", "Genome7", "Genome7", "Genome8", "Genome8",
"Genome8", "Genome8", "Genome8", "Genome8", "Genome8", "Genome8",
"Genome8", "Genome8", "Genome8"), gene = c("genA", "genB", "genC",
"genD", "genE", "genF", "protF", "protC", "protD", "protE", "genA",
"genB", "genC", "genD", "genE", "genF", "protA", "protB", "genA",
"genB", "genC", "genD", "genE", "genF", "genA", "genB", "genC",
"genD", "genE", "genF", "protA", "protB", "genA", "genB", "genC",
"genD", "genE", "genF", "protF", "protC", "protD", "protE", "genA",
"genB", "genC", "genD", "genE", "genF", "protA", "protB", "genB",
"genC", "genD", "genE", "genF", "protA", "protB", "protF", "protC",
"protD", "protE", "genB", "genC", "genD", "genE", "genF", "protA",
"protB", "protF", "protC", "protD", "protE"), start = c(405113,
407035, 407927, 408387, 408751, 409836, 410335, 412621, 412830,
413867, -67849, -65867, -64997, -64507, -64127, -63011, -62550,
-62187, -47353, -45431, -44522, -44070, -43701, -42614, 8345,
10327, 11394, 11878, 12258, 13365, 13726, 14260, 15389, 17301,
18176, 18641, 18999, 20086, 20474, 22777, 22986, 24024, 65751,
67698, 68605, 69128, 69501, 70614, 71008, 71375, -9390, -8984,
-8500, -8130, -7019, -6662, -6306, -5695, -3446, -3188, -2116,
2, 413, 898, 1268, 2376, 2733, 3089, 3700, 5949, 6217, 7307),
end = c(407035, 407916, 408394, 408737, 409830, 410315, 412596,
412833, 413870, 414850, -65864, -65013, -64548, -64127, -63048,
-62640, -62209, -61549, -45443, -44571, -44070, -43723, -42625,
-42201, 10330, 11181, 11843, 12255, 13337, 13733, 14067,
14919, 17299, 18161, 18640, 18985, 20078, 20451, 22720, 22989,
24023, 25010, 67691, 68570, 69135, 69511, 70583, 71015, 71349,
72034, -8992, -8511, -8123, -7048, -6663, -6321, -5653, -3449,
-3207, -2136, -1127, 406, 886, 1275, 2350, 2732, 3074, 3742,
5946, 6182, 7269, 8296), strand = c("forward", "forward",
"forward", "reverse", "forward", "forward", "reverse", "forward",
"forward", "forward", "reverse", "reverse", "reverse", "forward",
"reverse", "reverse", "reverse", "reverse", "reverse", "reverse",
"forward", "reverse", "forward", "forward", "forward", "forward",
"forward", "forward", "forward", "reverse", "forward", "reverse",
"reverse", "forward", "reverse", "forward", "reverse", "forward",
"forward", "forward", "forward", "forward", "forward", "forward",
"reverse", "forward", "reverse", "forward", "forward", "forward",
"reverse", "forward", "reverse", "reverse", "forward", "reverse",
"forward", "forward", "reverse", "reverse", "forward", "forward",
"forward", "forward", "forward", "forward", "reverse", "forward",
"forward", "reverse", "reverse", "reverse"), orientation = c(-1,
-1, -1, -1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1,
-1, 1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, 1, -1,
1, -1, 1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1,
1, -1, -1, -1, -1, 1, -1, -1, 1, 1, 1, 1, 1, 1, -1, -1, -1,
-1, -1, 1, 1, -1), hatch = c("NO", "NO", "NO", "NO", "NO",
"NO", "NO", "NO", "NO", "NO", "YES", "NO", "NO", "NO", "NO",
"NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",
"NO", "NO", "NO", "NO", "NO", "NO", "NO", "YES", "NO", "NO",
"NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",
"NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",
"NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO",
"NO", "NO", "NO", "NO", "NO", "NO", "NO")), row.names = c(NA,
-72L), class = "data.frame")
Problem:
I can't find the right way to make a plot with values from a given variable with points and plot the value of the mean with another different shape. So far I find a way of doing this, but mean value appears in the color legend also which is something I don't want to. How could I get the desired output? Should I use stat_summary?
NOTE: Variables must be ordered by the mean value among groups by multimorbidity (if it is something important for the solution proposed) this is why I am using reorder_within and scale_x_reordered.
source("https://raw.githubusercontent.com/dgrtwo/drlib/master/R/reorder_within.R")
library(tidyverse)
foo %>%
group_by(multimorbidity, variables) %>%
mutate(Mean = mean(varimportance),
aux_mean = Mean) %>%
ungroup() %>%
spread(Gender, varimportance) %>%
gather(Gender, varimportance, -multimorbidity, -variables, -aux_mean) %>%
mutate(type = if_else(Gender %in% c("Male", "Female"), "Gender", "Mean")) %>%
ggplot(aes(reorder_within(variables, aux_mean, multimorbidity), varimportance,
color = Gender, shape = type)) +
geom_point() +
scale_x_reordered() +
scale_shape_manual(values = c(21, 24)) +
coord_flip() +
facet_wrap(multimorbidity~., scales = "free")
Created on 2019-03-20 by the reprex package (v0.2.1)
The desired output:
dput for foo:
foo <- structure(list(
Gender = c(
"Male", "Male", "Male", "Male", "Male",
"Female", "Female", "Female", "Female", "Female", "Female", "Female",
"Female", "Female", "Female", "Male", "Male", "Male", "Male",
"Male"
), multimorbidity = c(
"Yes", "Yes", "Yes", "Yes", "Yes",
"No", "No", "No", "No", "No", "Yes", "Yes", "Yes", "Yes", "Yes",
"No", "No", "No", "No", "No"
), variables = c(
"bmi", "income",
"soccap", "alternattr", "occhaz", "bmi", "income", "soccap",
"alternattr", "occhaz", "bmi", "income", "soccap", "alternattr",
"occhaz", "bmi", "income", "soccap", "alternattr", "occhaz"
),
varimportance = c(
73.1234145437324, 51.0029811829917, 100,
0, 90.9926659603591, 81.1949541852942, 48.2402164701156,
100, 0, 9.10509052698692, 66.7759248406279, 31.69991730502,
100, 4.7914221037359, 93.4636133674693, 70.8853809607131,
75.004433319282, 100, 0, 43.7326141975936
)
), class = c(
"tbl_df",
"tbl", "data.frame"
), row.names = c(NA, -20L))
I am new to R and unable to calculate the entropy.
There is a similar question on stackoverflow with the answer but i wanted to know why this code isn't working. Here is the copy paste data from the same question.
One of the answer mentions, "The part I think you are missing is the calculation of the class frequencies and you will get your answer", but how do i fix this. I tried most of the options but still i don't get any output. It just runs without any errors.
info <- function(CLASS.FREQ){
freq.class <- CLASS.FREQ
info <- 0
for(i in 1:length(freq.class)){
if(freq.class[[i]] != 0){ # zero check in class
entropy <- -sum(freq.class[[i]] * log2(freq.class[[i]])) #I calculate the entropy for each class i here
}else{
entropy <- 0
}
info <- info + entropy # sum up entropy from all classes
}
return(info)
}
Dataset as below,
buys <- c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no")
credit <- c("fair", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "excellent")
student <- c("no", "no", "no","no", "yes", "yes", "yes", "no", "yes", "yes", "yes", "no", "yes", "no")
income <- c("high", "high", "high", "medium", "low", "low", "low", "medium", "low", "medium", "medium", "medium", "high", "medium")
age <- c(25, 27, 35, 41, 48, 42, 36, 29, 26, 45, 23, 33, 37, 44)
we change the age from categorical to numeric
Cheers, Jack
You need to calculate the propertion of "no" and "yes" in "buys", the proportion of "fair" and "excellent" in "credit", and so on. Here is one way to do it:
data <- list(
buys = c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no"),
credit = c("fair", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "excellent"),
student = c("no", "no", "no","no", "yes", "yes", "yes", "no", "yes", "yes", "yes", "no", "yes", "no"),
income = c("high", "high", "high", "medium", "low", "low", "low", "medium", "low", "medium", "medium", "medium", "high", "medium"),
age = c(25, 27, 35, 41, 48, 42, 36, 29, 26, 45, 23, 33, 37, 44)
)
freq <- lapply( data, function(x){rowMeans(outer(unique(x),x,"=="))})
.
> freq
$buys
[1] 0.3571429 0.6428571
$credit
[1] 0.5714286 0.4285714
$student
[1] 0.5 0.5
$income
[1] 0.2857143 0.4285714 0.2857143
$age
[1] 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857
[14] 0.07142857
Such a proportion can never be 0. So change if(freq.class[[i]] != 0){ # zero check in class to if(length(freq.class[[i]]) != 0){ # zero check in class:
info <- function(CLASS.FREQ){
freq.class <- CLASS.FREQ
info <- 0
for(i in 1:length(freq.class)){
if(length(freq.class[[i]]) != 0){ # zero check in class
entropy <- -sum(freq.class[[i]] * log2(freq.class[[i]])) #I calculate the entropy for each class i here
}else{
entropy <- 0
}
info <- info + entropy # sum up entropy from all classes
}
return(info)
}
.
> info(freq)
[1] 8.289526
> info(freq$buys)
[1] 0.940286
> info(freq$age)
[1] 3.807355
>
I've tried for several hours to calculate the Entropy and I know I'm missing something. Hopefully someone here can give me an idea!
EDIT: I think my formula is wrong!
CODE:
info <- function(CLASS.FREQ){
freq.class <- CLASS.FREQ
info <- 0
for(i in 1:length(freq.class)){
if(freq.class[[i]] != 0){ # zero check in class
entropy <- -sum(freq.class[[i]] * log2(freq.class[[i]])) #I calculate the entropy for each class i here
}else{
entropy <- 0
}
info <- info + entropy # sum up entropy from all classes
}
return(info)
}
I hope my post is clear, since it's the first time I actually post here.
This is my dataset:
buys <- c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no")
credit <- c("fair", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "excellent")
student <- c("no", "no", "no","no", "yes", "yes", "yes", "no", "yes", "yes", "yes", "no", "yes", "no")
income <- c("high", "high", "high", "medium", "low", "low", "low", "medium", "low", "medium", "medium", "medium", "high", "medium")
age <- c(25, 27, 35, 41, 48, 42, 36, 29, 26, 45, 23, 33, 37, 44) # we change the age from categorical to numeric
Ultimately I find no error in your code as it runs without error. The part I think you are missing is the calculation of the class frequencies and you will get your answer. Quickly running through the different objects you provide I suspect you are looking at buys.
buys <- c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no")
freqs <- table(buys)/length(buys)
info(freqs)
[1] 0.940286
As a matter of improving your code, you can simplify this dramatically as you don't need a loop if you are provided a vector of class frequencies.
For example:
# calculate shannon-entropy
-sum(freqs * log2(freqs))
[1] 0.940286
As a side note, the function entropy.empirical is in the entropy package where you set the units to log2 allowing some more flexibility. Example:
entropy.empirical(freqs, unit="log2")
[1] 0.940286
There is an another way similar to above answer but using a different function.
> buys <- c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no")
> probabilities <- prop.table(table(buys))
> probabilities
buys
no yes
0.3571429 0.6428571
> -sum(probabilities*log2(probabilities))
[1] 0.940286
Also there is a built in function entropy.empirical(probabilities, unit = "log2")