to calculate the Entropy - r

I am new to R and unable to calculate the entropy.
There is a similar question on stackoverflow with the answer but i wanted to know why this code isn't working. Here is the copy paste data from the same question.
One of the answer mentions, "The part I think you are missing is the calculation of the class frequencies and you will get your answer", but how do i fix this. I tried most of the options but still i don't get any output. It just runs without any errors.
info <- function(CLASS.FREQ){
freq.class <- CLASS.FREQ
info <- 0
for(i in 1:length(freq.class)){
if(freq.class[[i]] != 0){ # zero check in class
entropy <- -sum(freq.class[[i]] * log2(freq.class[[i]])) #I calculate the entropy for each class i here
}else{
entropy <- 0
}
info <- info + entropy # sum up entropy from all classes
}
return(info)
}
Dataset as below,
buys <- c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no")
credit <- c("fair", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "excellent")
student <- c("no", "no", "no","no", "yes", "yes", "yes", "no", "yes", "yes", "yes", "no", "yes", "no")
income <- c("high", "high", "high", "medium", "low", "low", "low", "medium", "low", "medium", "medium", "medium", "high", "medium")
age <- c(25, 27, 35, 41, 48, 42, 36, 29, 26, 45, 23, 33, 37, 44)
we change the age from categorical to numeric
Cheers, Jack

You need to calculate the propertion of "no" and "yes" in "buys", the proportion of "fair" and "excellent" in "credit", and so on. Here is one way to do it:
data <- list(
buys = c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no"),
credit = c("fair", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "excellent"),
student = c("no", "no", "no","no", "yes", "yes", "yes", "no", "yes", "yes", "yes", "no", "yes", "no"),
income = c("high", "high", "high", "medium", "low", "low", "low", "medium", "low", "medium", "medium", "medium", "high", "medium"),
age = c(25, 27, 35, 41, 48, 42, 36, 29, 26, 45, 23, 33, 37, 44)
)
freq <- lapply( data, function(x){rowMeans(outer(unique(x),x,"=="))})
.
> freq
$buys
[1] 0.3571429 0.6428571
$credit
[1] 0.5714286 0.4285714
$student
[1] 0.5 0.5
$income
[1] 0.2857143 0.4285714 0.2857143
$age
[1] 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.07142857
[14] 0.07142857
Such a proportion can never be 0. So change if(freq.class[[i]] != 0){ # zero check in class to if(length(freq.class[[i]]) != 0){ # zero check in class:
info <- function(CLASS.FREQ){
freq.class <- CLASS.FREQ
info <- 0
for(i in 1:length(freq.class)){
if(length(freq.class[[i]]) != 0){ # zero check in class
entropy <- -sum(freq.class[[i]] * log2(freq.class[[i]])) #I calculate the entropy for each class i here
}else{
entropy <- 0
}
info <- info + entropy # sum up entropy from all classes
}
return(info)
}
.
> info(freq)
[1] 8.289526
> info(freq$buys)
[1] 0.940286
> info(freq$age)
[1] 3.807355
>

Related

Editing loop to append output by variable

The script below is used to create a loop, create certain outputs and separately export them. I would like to modify the script to append the variables of interest by their stratifiers - i.e. combine health_mental, overall_health and outlook_life outputs when they're stratified by age, and another output when they're stratified by sex. I would like to then export those files out in excel format. Thanks in advance for the help!
library(tidyverse)
library(dplyr)
df <- data.frame (overall_health = c("poor", "good", "excellent", "poor", "good", "poor", "poor", "excellent"),
outlook_life = c("good", "excellent", "excellent", "poor", "excellent", "poor", "excellent", "poor"),
health_mental = c("poor", "poor", "excellent", "poor", "poor", "poor", "excellent", "good"),
sex = c("F", "M", "M", "F", "F", "M", "F", "M"),
age_group = c("50-54", "60-64", "80+", "70-74", "40-44", "45-49", "60-64", "65-69"),
income = c("$<40,000", "$50,000-79,000", "$80,000-110,000", "$111,000+", "$<40,000", "$<40,000", "$50,000-79,000", "$80,000-110,000"),
education = c("HS", "College", "Bachelors", "Masters", "HS", "College", "Bachelors", "Masters"),
geography= c("area1", "area2", "area1", "area2", "area2", "area1", "area2", "area1"))
geos <- unique(df$geography)
vars <- c("health_mental", "overall_health", "outlook_life")
combinations <- expand.grid(c("age_group", "sex"), vars, stringsAsFactors = F)
combinations$label <- paste(combinations$Var1, combinations$Var2, sep = "_")
output <- list()
for (geo in geos){
for (combo in 1:nrow(combinations)){
output_label <- paste(combinations[[combo,"label"]],geo,sep="_")
temp <- df %>%
filter(geography == geo) %>%
group_by_at(combinations[combo,1:2] %>% unlist()%>%unname()) %>%
summarise(count = n(),
total = nrow(.),
proportion = count/nrow(.) *100)
output[[output_label]] <- temp
}
}
Expected output - example

what R Code to calculate the entropy for each level in a categorical variable

I have quite some categorical variable in my dataset, These variables have more than two levels each. Now i want an R code function (or loop) that can calculate the entropy and information gain for each levels in each categorical variable and return the lowest entropy and highest information gain.
data <- list(buys = c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no"),credit = c("fair", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "excellent"),student = c("no", "no", "no","no", "yes", "yes", "yes", "no", "yes", "yes", "yes", "no", "yes", "no"),income = c("high", "high", "high", "medium", "low", "low", "low", "medium", "low", "medium", "medium", "medium", "high", "medium"),age = c(25, 27, 35, 41, 48, 42, 36, 29, 26, 45, 23, 33, 37, 44))
data<- as.data.frame(data)
Above is a sample dataframe
entropy_tab <- function(x) { tabfun2 <- prop.table(table(data[,x],training_credit_Risk[,13]) + 1e-6, margin = 1)sum(prop.table(table(data[,x]))*rowSums(-tabfun2*log2(tabfun2)))}
Above function calculates entropy for each variable, i want a fuction to calculate the contribution to the entropy for each level? i.e the contribution of "excellent" and "fair" to the entropy of "Credit"
In measure theory, the expected surprisal of an event A in a measure space with measure mu is
-mu(A)log(mu(A))
And so the entropy is the sum over all events of the expected surprisal. So what you're looking for is the expected surprisal of each level of each variable.
Note you won't be able to express the surprisal of a data frame as a data frame, as each variable in the data frame has a different number of levels.
You can do
exp_surprisal <- function(x, base=exp(1)) {
t <- table(x)
freq <- t/sum(t)
ifelse(freq==0, 0, -freq * log(freq, base))
}
And then
lapply(data, exp_surprisal)
gives
$buys
x
no yes
0.3677212 0.2840353
$credit
x
excellent fair
0.3631277 0.3197805
$student
x
no yes
0.3465736 0.3465736
$income
x
high low medium
0.3579323 0.3579323 0.3631277
$age
x
23 25 26 27 29 33 35 36 37 41 42 44 45 48
0.1885041 0.1885041 0.1885041 0.1885041 0.1885041 0.1885041 0.1885041 0.1885041 0.1885041 0.1885041 0.1885041 0.1885041 0.1885041 0.1885041
Note you can also define
entropy <- function(x) sum(exp_surprisal(x))
to get the entropy.
Then
lapply(data, entropy)
gives
$buys
[1] 0.6517566
$credit
[1] 0.6829081
$student
[1] 0.6931472
$income
[1] 1.078992
$age
[1] 2.639057
You have to modify your function to have two inputs, the variable you want and the level of the variable. Inside the function you then have to subset based on the level of the variable you want. I then use mapply to loop through the variable credit and each of its levels.
entropy_tab <- function(x,y) {
tabfun2 <- prop.table(table(data[,x][data[,x] == y] ,data[,5][data[,x]==y]) + 1e-6, margin = 1)
sum(prop.table(table(data[,x][data[,x] == y]))*rowSums(-tabfun2*log2(tabfun2)))
}
x <- mapply(entropy_tab, c("credit","credit"), unique(data$credit))
names(x) <- unique(data$credit)
#checks
entropy_tab("credit","excellent")
entropy_tab("credit","fair")

Cannot make a logistic curve in R

I am trying to make a logistic curve in R but the line does not appear in the plot.
My data are:
dput(los1)
structure(list(X1 = c("5.51688462301445", "2.55660506920185",
"4.17130300764484", "15.0032350113684", "0.0672790807684578",
"0", "10.7646529229551", "1.6819770192119", "4.44041933071867",
"2.69116323073877", "0", "0.740069888453036", "1.54741885767498",
"0.201837242305373", "1.81653518074882", "6.12239634993057",
"3.49851219996026", "22.4039338958996", "0.538232646147662",
"0.134558161536916", "1.2783025346007", "1.6819770192119", "16.9543283536541",
"60.0129400454734", "9.62090854989083", "0.470953565379205",
"33.7740985457708", "6.8624662383836", "0", "0", "4.50769841148758",
"62.6368241954438", "264.137671097005", "14.5995605267576", "0",
"0", "0", "6.12239634993057", "10.1591411960385", "22.9421665420477",
"0.470953565379205", "2.28748874612802", "13.8594906383046",
"11.0337692460289", "18.6363053728655", "27.2480277112295", "0.0672790807684578",
"0.470953565379205", "0", "0"), X2 = c("No", "No", "Yes", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "Yes", "Yes",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No")), row.names = c(NA, 50L), class = "data.frame")
and the code I use for the curve is:
los1 %>%
mutate(prob = ifelse(X2 == "Yes", 1, 0)) %>%
ggplot(aes(X1, prob)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "lm", se=FALSE, method.args = list(family = "binomial")) +
labs(
title = "Logistic Regression Model",
x = "Plasma Glucose Concentration",
y = "Probability of being diabete-pos"
)
Any idea how to make the curve?
Another option using stat_smooth with "glm" method and "X1" converted to numeric like this:
library(tidyverse)
los1 %>%
mutate(prob = ifelse(X2 == "Yes", 1, 0)) %>%
mutate(X1 = as.numeric(X1)) %>%
ggplot(aes(X1, prob)) +
geom_point(alpha = 0.2) +
stat_smooth(method="glm", color="green", se=FALSE, method.args = list(family=binomial)) +
labs(
title = "Logistic Regression Model",
x = "Plasma Glucose Concentration",
y = "Probability of being diabete-pos"
)
#> `geom_smooth()` using formula 'y ~ x'
Created on 2022-08-30 with reprex v2.0.2
los1 <- structure(list(X1 = c("5.51688462301445", "2.55660506920185",
"4.17130300764484", "15.0032350113684", "0.0672790807684578",
"0", "10.7646529229551", "1.6819770192119", "4.44041933071867",
"2.69116323073877", "0", "0.740069888453036", "1.54741885767498",
"0.201837242305373", "1.81653518074882", "6.12239634993057",
"3.49851219996026", "22.4039338958996", "0.538232646147662",
"0.134558161536916", "1.2783025346007", "1.6819770192119", "16.9543283536541",
"60.0129400454734", "9.62090854989083", "0.470953565379205",
"33.7740985457708", "6.8624662383836", "0", "0", "4.50769841148758",
"62.6368241954438", "264.137671097005", "14.5995605267576", "0",
"0", "0", "6.12239634993057", "10.1591411960385", "22.9421665420477",
"0.470953565379205", "2.28748874612802", "13.8594906383046",
"11.0337692460289", "18.6363053728655", "27.2480277112295", "0.0672790807684578",
"0.470953565379205", "0", "0"), X2 = c("No", "No", "Yes", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "Yes", "Yes",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No")), row.names = c(NA, 50L), class = "data.frame")
str(los1)
los1$X1 <- as.numeric(los1$X1)
los1$Y <- ifelse(los1$X2 == "Yes", 1, 0)
library(ggplot2)
los1 |>
ggplot(aes(X1, Y)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "glm", se=FALSE, method.args = list(family = "binomial")) +
labs(
title = "Logistic Regression Model",
x = "Plasma Glucose Concentration",
y = "Probability of being diabete-pos"
)

Calculating Entropy

I've tried for several hours to calculate the Entropy and I know I'm missing something. Hopefully someone here can give me an idea!
EDIT: I think my formula is wrong!
CODE:
info <- function(CLASS.FREQ){
freq.class <- CLASS.FREQ
info <- 0
for(i in 1:length(freq.class)){
if(freq.class[[i]] != 0){ # zero check in class
entropy <- -sum(freq.class[[i]] * log2(freq.class[[i]])) #I calculate the entropy for each class i here
}else{
entropy <- 0
}
info <- info + entropy # sum up entropy from all classes
}
return(info)
}
I hope my post is clear, since it's the first time I actually post here.
This is my dataset:
buys <- c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no")
credit <- c("fair", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "excellent")
student <- c("no", "no", "no","no", "yes", "yes", "yes", "no", "yes", "yes", "yes", "no", "yes", "no")
income <- c("high", "high", "high", "medium", "low", "low", "low", "medium", "low", "medium", "medium", "medium", "high", "medium")
age <- c(25, 27, 35, 41, 48, 42, 36, 29, 26, 45, 23, 33, 37, 44) # we change the age from categorical to numeric
Ultimately I find no error in your code as it runs without error. The part I think you are missing is the calculation of the class frequencies and you will get your answer. Quickly running through the different objects you provide I suspect you are looking at buys.
buys <- c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no")
freqs <- table(buys)/length(buys)
info(freqs)
[1] 0.940286
As a matter of improving your code, you can simplify this dramatically as you don't need a loop if you are provided a vector of class frequencies.
For example:
# calculate shannon-entropy
-sum(freqs * log2(freqs))
[1] 0.940286
As a side note, the function entropy.empirical is in the entropy package where you set the units to log2 allowing some more flexibility. Example:
entropy.empirical(freqs, unit="log2")
[1] 0.940286
There is an another way similar to above answer but using a different function.
> buys <- c("no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no")
> probabilities <- prop.table(table(buys))
> probabilities
buys
no yes
0.3571429 0.6428571
> -sum(probabilities*log2(probabilities))
[1] 0.940286
Also there is a built in function entropy.empirical(probabilities, unit = "log2")

Coloring the states according to a given variable using ggplot

I have plotted the map as follows. I need help with adding some extra layer. I know there are many ways to do this, but as a newbie to ggplot I am really clueless beyond this point. Any help would be much appreciated.thanks
I have used the code:
shp_state1<-readShapeSpatial("...")
mapindia<-fortify(shp_state1)
q=ggplot(data=mapindia, aes(long, lat, group=group,colour)) + geom_polygon(fill="#FF9999", colour="black") +coord_map()
now i have the following data(as data frame):
Row.Labels LATITUDE LONGITUDE MAJORITY
1 Andhra Pradesh 78.30000 17.200001 Yes
2 ASSAM 91.50000 26.090000 No
3 Bihar 85.13000 25.370001 No
4 Chandigarh 76.79855 30.744196 No
5 CHHATTISGARH 81.63000 21.230000 No
6 DADRA & NAGAR HAVELI 72.96667 20.266666 Yes
7 DAMAN & DIU 72.80640 20.251890 No
8 Delhi 72.80640 20.251890 No
9 GOA 73.96992 15.384293 Yes
10 GUJARAT 72.40000 23.030001 No
11 Haryana 75.95947 29.017748 No
12 Himachal 75.95947 29.017748 No
13 Jharkhand 85.33000 23.350000 No
14 KARNATAKA 75.68481 14.849231 Yes
15 KERALA 76.82739 9.470736 Yes
16 MAHARASHTRA 75.64087 19.590844 Yes
17 MANIPUR 93.58000 24.440001 No
18 MEGHALAYA 91.00000 25.299999 No
19 MP 93.00000 23.299999 No
20 Odissa 77.21067 28.623932 No
21 PONDICHERRY 79.82803 11.937899 No
22 Punjab 75.50000 30.400000 No
23 Rajasthan 75.52000 26.549999 No
24 TAMIL NADU 88.40000 27.200001 Yes
25 TRIPURA 91.25000 23.500000 No
26 UP 91.25000 23.500000 No
27 Uttrakhand 78.20000 30.110001 No
28 WEST BENGAL 88.24000 22.340000 No
data <- structure(list(Row.Labels = c("Andhra Pradesh", "ASSAM", "Bihar",
"Chandigarh", "CHHATTISGARH", "DADRA & NAGAR HAVELI", "DAMAN & DIU",
"Delhi", "GOA", "GUJARAT", "Haryana", "Himachal", "Jharkhand",
"KARNATAKA", "KERALA", "MAHARASHTRA", "MANIPUR", "MEGHALAYA",
"MP", "Odissa", "PONDICHERRY", "Punjab", "Rajasthan", "TAMIL NADU",
"TRIPURA", "UP", "Uttrakhand", "WEST BENGAL"), LATITUDE = c(78.3,
91.5, 85.13, 76.79855, 81.63, 72.96667, 72.8064, 72.8064, 73.96992,
72.4, 75.95947, 75.95947, 85.33, 75.68481, 76.82739, 75.64087,
93.58, 91, 93, 77.21067, 79.82803, 75.5, 75.52, 88.4, 91.25,
91.25, 78.2, 88.24), LONGITUDE = c(17.200001, 26.09, 25.370001,
30.744196, 21.23, 20.266666, 20.25189, 20.25189, 15.384293, 23.030001,
29.017748, 29.017748, 23.35, 14.849231, 9.470736, 19.590844,
24.440001, 25.299999, 23.299999, 28.623932, 11.937899, 30.4,
26.549999, 27.200001, 23.5, 23.5, 30.110001, 22.34), MAJORITY = c("Yes",
"No", "No", "No", "No", "Yes", "No", "No", "Yes", "No", "No",
"No", "No", "Yes", "Yes", "Yes", "No", "No", "No", "No", "No",
"No", "No", "Yes", "No", "No", "No", "No")), .Names = c("Row.Labels",
"LATITUDE", "LONGITUDE", "MAJORITY"), class = "data.frame", row.names = c(NA, -28L))
How can I colour the sates according as "yes" or "no" ?
Considering your data is stored in a data frame named data, here's one way:
library(raster); library(ggplot2)
india <- getData('GADM', country="IND", level=1)
f_india <- fortify(india)
i <- sapply(india#data$NAME_1, function(x) agrep(x, data$Row.Labels, max.distance=.3, ignore.case=T)[1])
india#data$maj <- data$MAJORITY[i]
f_india <- merge(x=f_india, y=unique(india#data), by.x="id", by.y="ID_1",all.x=T)
f_india <- f_india[with(f_india, order(id, order)), ] # to prevent this http://stackoverflow.com/questions/24039621/code-not-working-for-other-shp-files
ggplot(f_india, aes(x=long, y=lat, group=group, fill=maj)) +
geom_polygon(colour="black")
You may want to adjust i which connects the names from your data frame with the names from the map data to pull in the votes into the map data.

Resources