Confidence calculation in association rule - associations

supportData = {('ELF'): 0.75, ('CAT'): 0.75, ('BAT', 'CAT', 'ELF'): 0.5, ('ARK', 'BAT'): 0.25, ('ARK', 'ELF'): 0.25, ('CAT', 'ELF'): 0.5, ('DOG'): 0.25, ('BAT', 'CAT'): 0.5, ('BAT', 'ELF'): 0.75, ('ARK'): 0.5, ('ARK', 'CAT'): 0.5, ('BAT'): 0.75}
L = [('ARK'), ('CAT'), ('CAT'), ('ELF'),('ARK', 'CAT'), ('BAT', 'ELF'), ('BAT', 'CAT'), ('CAT', 'ELF'),('BAT', 'CAT', 'ELF')]
for freqSet in L:
H = list(freqSet)
if len(H) == 1:
pass
else:
for conseq in H:
freqsetlist = list(freqSet)
freqsetlist.remove(conseq)
if len(freqsetlist) == 1:
conf = supportData[freqSet]/supportData[tuple(freqsetlist)[0]]
if conf >= 0.1:
print freqsetlist,'-->',conseq,'conf:',conf
else:
conf = supportData[freqSet]/supportData[tuple(freqsetlist)[:]]
if conf >= 0.1:
print freqsetlist,'-->',conseq,'conf:',conf
KeyError: ('R','K')
Can someone point out why I am getting this error? It seems the error occurs when len(freqsetlist) is > 1, that is, when processing a tuple with 3 elements.

When a tuple has only one element, you should put a comma after it, so it will be interpreted as a tuple, and not as a single (parenthesized) element:
'CAT' == ('CAT') != ('CAT',) == tuple(['CAT'])
Since both your supportData and L variables are currently mixing strings and tuples (when you, I guess, only wanted to have tuples), this line of code for instance will never run:
if len(H) == 1:
pass
Update: as for your error message, take the first element, ('ARK'): you first converted it to a list (['A', 'R', 'K']), then removed the first element ('A'), then converted the result back to a tuple - yielding ('R', 'K'). When you tried to look it up in your supportData dict (in the same line of code you mentioned) it wasn't there, thus a KeyError.
Correct your data as I explained, and this particular error will be avoided:
supportData = {('ELF',): 0.75, ('CAT',): 0.75, ('BAT', 'CAT', 'ELF'): 0.5, ('ARK', 'BAT'): 0.25, ('ARK', 'ELF'): 0.25, ('CAT', 'ELF'): 0.5, ('DOG',): 0.25, ('BAT', 'CAT'): 0.5, ('BAT', 'ELF'): 0.75, ('ARK',): 0.5, ('ARK', 'CAT'): 0.5, ('BAT',): 0.75}
L = [('ARK',), ('CAT',), ('CAT',), ('ELF',),('ARK', 'CAT'), ('BAT', 'ELF'), ('BAT', 'CAT'), ('CAT', 'ELF'),('BAT', 'CAT', 'ELF')]

Related

R: label variables based on their prefix

I'm new to R and am trying to label several, but not all, variables of my data at the same time. Specifically, I want to label the variables starting with "pol". I tried to combine the select and the set_variable_labels commands in the following manner:
cp14 <- cp14 %>%
select(matches("pol")) %>%
set_variable_labels(cp14,
labels = "Interest in politics")
I would like all variables that include "pol" to be labelled as "Interest in politics". This however does not work. Any advice on how to do this in a similar or completely different manner is greatly appreciated.
My data looks something like this, but with many more variables:
structure(list(pol_interest_w1 = c(0.5, 0.5, 0.25, 0.25, 0.25,
0.5), pol_interest_w2 = c(0.5, 0.5, 0.25, NA, 0.25, 0.5), pol_interest_w3 = c(0.5,
0.5, 0.25, NA, 0, 0.5), pol_interest_w4 = c(0.5, 0.5, 0.25, NA,
0, 0.5), pol_interest_w5 = c(0.5, 0.5, 0.25, NA, 0, 0.5), pol_interest_w6 = c(0.5,
0.5, 0.25, NA, 0, 0.5), pol_interest_w7 = c(0.5, 0.5, 0.25, NA,
0.25, 0.5), new_col = c(0.75, 0.5, 0.25, NA, 0.25, 0.5)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
You can do this in a couple ways. For either solution, start by creating a vector of variable names starting with "pol". (I use stringr::str_starts() here; you don’t want to use select(), as in your code, which is for subsetting columns from your dataset.)
library(stringr)
library(labelled)
pol_vars <- names(cp14)[str_starts(names(cp14), "pol")]
Then, you can make a named list of labels, and pass it to the .labels argument of labelled::set_variable_labels().
pol_labels <- setNames(
as.list(rep("Interest in politics", length(pol_vars))),
pol_vars
)
cp14 <- set_variable_labels(cp14, .labels = pol_labels)
Alternatively, you could loop over the variable names and assign labels using labelled::var_label().
for (v in pol_vars) {
var_label(cp14[[v]]) <- "Interest in politics"
}
Both approaches yield the same result:
#> var_label(cp14)
$pol_interest_w1
[1] "Interest in politics"
$pol_interest_w2
[1] "Interest in politics"
$pol_interest_w3
[1] "Interest in politics"
$pol_interest_w4
[1] "Interest in politics"
$pol_interest_w5
[1] "Interest in politics"
$pol_interest_w6
[1] "Interest in politics"
$pol_interest_w7
[1] "Interest in politics"
$new_col
NULL

bnlearn Error: Wrong number of conditional probability distributions

I am learning to work with bnlearn and I keep running into the following error in the last line of my code below:
Error in custom.fit(dag, cpt) : wrong number of conditional probability distributions
What am I doing wrong?
modelstring(dag)= "[s][r][nblw|r][nblg|nblw][mlw|s:r][f|s:r:mlw][mlg|mlw:f]
[mlgr|mlg:nblg]"
###View DAG Specifics
dag
arcs(dag)
nodes(dag)
# Create Levels
State <- c("State0", "State1")
##Create probability distributions given; these are all 2d b/c they have 1 or 2 nodes
cptS <- matrix(c(0.6, 0.4), ncol=2, dimnames=list(NULL, State))
cptR <- matrix(c(0.7, 0.3), ncol=2, dimnames=list(NULL, State))
cptNBLW <- matrix(c(0.95, 0.05, 0.05, 0.95), ncol=2, dimnames=list(NULL, "r"= State))
cptNBLG <- matrix(c(0.9, 0.099999999999999998, 0.2, 0.8), ncol=2, dimnames=list(NULL,
"nblw"=State))
cptMLG <- matrix(c(0.95, 0.05, 0.4, 0.6, 0.2, 0.8, 0.05, 0.95),ncol=2,nrow = 2,
dimnames=list("mlw"= State, "f"=State))
cptMLGR <- matrix(c(0.6,0.4,0.95,0.05,0.2,0.8,0.55,0.45),ncol=2,nrow = 2,
dimnames=list("mlg"= State, "nblg"=State))
cptMLW <-matrix(c(0.95, 0.05, 0.1, 0.9, 0.2, 0.8, 0.01, 0.99), ncol=2,nrow = 2,byrow = TRUE,
dimnames=list("r"= State, "s"=State))
# Build 3-d matrices (because you have 3 nodes, you can't use the matrix function; you
# have to build it from scratch)
cptF <- c(0.05, 0.95, 0.4, 0.6, 0.9, 0.1, 0.99, 0.01, 0.9, 0.1, 0.95, 0.05, 0.95, 0.05, 0.99,
0.01)
dim(cptF) <- c(2, 2, 2, 2)
dimnames(cptF) <- list("s"=State, "r"=State, "mlw"=State)
###Create CPT Table
cpt <- list(s = cptS, r = cptR, mlw = cptMLW,nblw= cptNBLW,
mlg= cptMLG, nblg= cptNBLG, mlgr= cptMLGR)
# Construct BN network with Conditional Probability Table
S.net <- custom.fit(dag,cpt)
Reference: https://rpubs.com/sarataheri/bnlearnCGM
You have several errors in your CPT definitions. Primarily, you need to make sure that:
the number of probabilities supplied is equal to the product of the number of states in the child and parent nodes,
the number of dimensions of the matrix/array is equal to the number of parent nodes plus one (for the child node),
the child node is given in the first dimension when the node dimension is greater than one,
the names given in the dimnames arguments (e.g. the names in dimnames=list(ThisName = ...)) should match the names that were defined in the DAG, in your case with modelstring and in my answer with model2network. (So my earlier suggestion of using dimnames=list(cptNBLW = ...) should be dimnames=list(nblw = ...) to match how node nblw was declared in the model string)
You also did not add node f into your cpt list.
Below is your code with comments where things have been changed. (I have commented out the offending lines and added ones straight after)
library(bnlearn)

# Network structure; the node names declared here are the names the CPT
# dimnames below have to match.
dag <- model2network(
  "[s][r][nblw|r][nblg|nblw][mlw|s:r][mlg|mlw:f][mlgr|mlg:nblg][f|s:r:mlw]"
)
State <- c("State0", "State1")

# Root nodes (no parents): one row of probabilities over the two states.
cptS <- matrix(c(0.6, 0.4), ncol = 2, dimnames = list(NULL, State))
cptR <- matrix(c(0.7, 0.3), ncol = 2, dimnames = list(NULL, State))

# add child node into first slot of dimnames
cptNBLW <- matrix(
  c(0.95, 0.05, 0.05, 0.95),
  ncol = 2,
  dimnames = list(nblw = State, r = State)
)
cptNBLG <- matrix(
  c(0.9, 0.099999999999999998, 0.2, 0.8),
  ncol = 2,
  dimnames = list(nblg = State, nblw = State)
)

# Use a 3d array and not matrix, and add child node into dimnames
# cptMLG <- matrix(c(0.95, 0.05, 0.4, 0.6, 0.2, 0.8, 0.05, 0.95),ncol=2,nrow = 2, dimnames=list("mlw"= State, "f"=State))
cptMLG <- array(
  c(0.95, 0.05, 0.4, 0.6, 0.2, 0.8, 0.05, 0.95),
  dim = c(2, 2, 2),
  dimnames = list(mlg = State, mlw = State, f = State)
)
# cptMLGR <- matrix(c(0.6,0.4,0.95,0.05,0.2,0.8,0.55,0.45),ncol=2,nrow = 2, dimnames=list("mlg"= State, "nblg"=State))
cptMLGR <- array(
  c(0.6, 0.4, 0.95, 0.05, 0.2, 0.8, 0.55, 0.45),
  dim = c(2, 2, 2),
  dimnames = list(mlgr = State, mlg = State, nblg = State)
)
# cptMLW <-matrix(c(0.95, 0.05, 0.1, 0.9, 0.2, 0.8, 0.01, 0.99), ncol=2,nrow = 2,byrow = TRUE, dimnames=list("r"= State, "s"=State))
cptMLW <- array(
  c(0.95, 0.05, 0.1, 0.9, 0.2, 0.8, 0.01, 0.99),
  dim = c(2, 2, 2),
  dimnames = list(mlw = State, r = State, s = State)
)

# add child into first slot of dimnames
# (f has three parents, so its table is a 4-d array: child first, then s, r, mlw)
cptF <- array(
  c(
    0.05, 0.95, 0.4, 0.6, 0.9, 0.1, 0.99, 0.01,
    0.9, 0.1, 0.95, 0.05, 0.95, 0.05, 0.99, 0.01
  ),
  dim = c(2, 2, 2, 2),
  dimnames = list(f = State, s = State, r = State, mlw = State)
)

# add missing node f into list
cpt <- list(
  s = cptS,
  r = cptR,
  mlw = cptMLW,
  nblw = cptNBLW,
  mlg = cptMLG,
  nblg = cptNBLG,
  mlgr = cptMLGR,
  f = cptF
)

# Construct BN network with Conditional Probability Table
S.net <- custom.fit(dag, dist = cpt)

How can I create a customized colormap for geoviews (bokeh)?

I'm trying to plot an xarray dataset in Geoviews, like this:
https://geoviews.org/gallery/bokeh/xarray_image.html#bokeh-gallery-xarray-image
There I can define a colormap by cmap.
The cmap is just a list of hex-codes, like:
['#150b00',
'#9b4e00',
'#f07800',
'#ffa448',
'#a8a800',
'#dddd00',
'#ffff00',
'#ffffb3',
'#ffffff',
'#b0ffff',
'#00e8e8',
'#00bfbf',
'#008a8a',
'#79bcff',
'#0683ff',
'#0000c1',
'#000048']
I want to define the value levels for these colors, like this list:
[-10.0,
-5.0,
-2.5,
-1.0,
-0.5,
-0.2,
-0.1,
-0.05,
0.05,
0.1,
0.2,
0.5,
1.0,
2.5,
5.0,
10.0]
How can I define these levels?
Please try setting the color_levels parameter to the desired values. This is explained in the HoloViews "Styling Plots" guide, in the section "Custom color intervals". HoloViews is the library that gv.Image comes from, so this should work.
cmap = ['#150b00', '#9b4e00', '#f07800', '#ffa448', '#a8a800', '#dddd00', '#ffff00', '#ffffb3', '#ffffff', '#b0ffff', '#00e8e8', '#00bfbf', '#008a8a', '#79bcff', '#0683ff', '#0000c1', '#000048']
levels = [-10.0, -5.0, -2.5, -1.0, -0.5, -0.2, -0.1, -0.05, 0.05, 0.1, 0.2, 0.5, 1.0, 2.5, 5.0, 10.0]
images.opts(
cmap=cmap,
color_levels=levels,
colorbar=True,
width=600,
height=500) * gf.coastline
Comment
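If this is not working, then I apologize. At the moment I am not able to install GeoViews on my machine. Note that the HoloViews example for color_levels pairs N level boundaries with N - 1 colors, so the 17-color cmap above would presumably need 18 boundaries (for example, by adding outer limits below -10.0 and above 10.0); please verify this against your HoloViews version.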

unable to scrape by traversing <br> in <td> using scrapy with css

html code is following:
<td class="column-3">
(price per 1,000 images)<br>
0-1M images -
<span class="price-data " data-amount="{"regional":{"asia-pacific-southeast":0.5,"australia-east":0.5,"brazil-south":0.5,"canada-central":0.5,"central-india":0.5,"europe-north":0.5,"europe-west":0.5,"united-kingdom-south":0.5,"us-east":0.5,"us-east-2":0.5,"us-south-central":0.5,"us-west-2":0.5,"us-west-central":0.5}}" data-decimals="3" data-decimals-force="0" data-region-unavailable="N/A" data-has-valid-price="true">$0.50</span> <br>
1M-5M images -
<span class="price-data " data-amount="{"regional":{"asia-pacific-southeast":0.4,"australia-east":0.4,"brazil-south":0.4,"canada-central":0.4,"central-india":0.4,"europe-north":0.4,"europe-west":0.4,"united-kingdom-south":0.4,"us-east":0.4,"us-east-2":0.4,"us-south-central":0.4,"us-west-2":0.4,"us-west-central":0.4}}" data-decimals="3" data-decimals-force="0" data-region-unavailable="N/A" data-has-valid-price="true">$0.40</span> <br>
5M+ images -
<span class="price-data " data-amount="{"regional":{"asia-pacific-southeast":0.325,"australia-east":0.325,"brazil-south":0.325,"canada-central":0.325,"central-india":0.325,"europe-north":0.325,"europe-west":0.325,"united-kingdom-south":0.325,"us-east":0.325,"us-east-2":0.325,"us-south-central":0.325,"us-west-2":0.325,"us-west-central":0.325}}" data-decimals="3" data-decimals-force="0" data-region-unavailable="N/A" data-has-valid-price="true">$0.325</span> <br>
</td>
url: https://azure.microsoft.com/en-in/pricing/details/search/
How can I traverse <br> and scrape the data? I want to split td tags into count(br) times and then scrape. I don't want to use xpath. I want to get the result through css.
import json

from scrapy.selector import Selector

dumb = 'Your response, or above text'
html_dumb = Selector(text=dumb)
# Non-empty text nodes directly under <td>, with surrounding whitespace and
# the trailing "-" separator stripped.
td_vals = [x.strip().strip('- ') for x in
           html_dumb.xpath("//td/text()").extract() if x.strip()] #got all td values
f_val = td_vals[0] # separate the first one. here (price per 1,000 images)
td_vals = td_vals[1:]
# XPath selects an attribute with "@" ("#" is not valid XPath syntax).
span_vals = [x.strip() for x in html_dumb.xpath("//span/@data-amount").extract() if x.strip()] #got all span data, you can also get span text if you need
inner_json = {}
result = {}
# Pair each tier label with the JSON payload of the <span> that follows it.
for td_val, span_val in zip(td_vals, span_vals):
    inner_json[td_val] = json.loads(span_val) #building inner dictionary
result[f_val] = inner_json #append in outer one
{u'(price per 1,000 images)': {u'5M+ images': {u'regional': {u'united-kingdom-south': 0.325, u'europe-north': 0.325, u'brazil-south': 0.325, u'us-west-2': 0.325, u'us-south-central': 0.325, u'central-india': 0.325, u'us-east': 0.325, u'canada-central': 0.325, u'europe-west': 0.325, u'us-east-2': 0.325, u'us-west-central': 0.325, u'asia-pacific-southeast': 0.325, u'australia-east': 0.325}}, u'0-1M images': {u'regional': {u'united-kingdom-south': 0.5, u'europe-north': 0.5, u'brazil-south': 0.5, u'us-west-2': 0.5, u'us-south-central': 0.5, u'central-india': 0.5, u'us-east': 0.5, u'canada-central': 0.5, u'europe-west': 0.5, u'us-east-2': 0.5, u'us-west-central': 0.5, u'asia-pacific-southeast': 0.5, u'australia-east': 0.5}}, u'1M-5M images': {u'regional': {u'united-kingdom-south': 0.4, u'europe-north': 0.4, u'brazil-south': 0.4, u'us-west-2': 0.4, u'us-south-central': 0.4, u'central-india': 0.4, u'us-east': 0.4, u'canada-central': 0.4, u'europe-west': 0.4, u'us-east-2': 0.4, u'us-west-central': 0.4, u'asia-pacific-southeast': 0.4, u'australia-east': 0.4}}}}

Add value or percentage in contingency table plot

I would like to put, in each box of the plotted contingency table, the value from the corresponding cell of the table.
The following image represents the contingency table
The following code displays the contingency table:
> svm.video.table2<-table(pred=svm.video.pred2, true= filteredDataFinal$rate)
> svm.video.table2
And this is how to plot that table
plot(svm.video.table2)
An ad hoc approach would be:
# Hard-coded label positions, one (x, y, label) triple per annotated cell;
# text() is vectorised, so a single call draws all five labels in order.
text(
  x = c(0.23, 0.23, 0.64, 0.64, 0.92),
  y = c(0.55, 0.67, 0.94, 0.45, 0.44),
  labels = c("10", "2", "1", "9", "4")
)
PS: I generated the data to make your example reproducible with svm.video.table2 <- as.table(matrix(c(10, 1, 0, 2, 9, 0, 0, 0, 4), ncol = 3))

Resources