How to combine rows into one row in TermDocumentMatrix? - r

I am trying to combine rows into one row in a TermDocumentMatrix
(I know that each row represents one word),
e.g. cabin, staff -> crews.
Because 'cabin', 'staff' and 'crew' mean the same thing,
I am trying to combine the rows that represent 'cabin' and 'staff'
into the one row that represents 'crew'.
However, it doesn't work at all:
R says: argument "weighting" is missing, with no default.
The code I typed is below:
# Scrape the first 10 review pages and build a term-document matrix from
# the review texts.
base_url <- 'http://www.airlinequality.com/airline-reviews/cathay-pacific-airways/'
page_ids <- 1:10
# One list slot per page: avoids re-copying a growing vector with c() on
# every iteration. (The two fetches of base_url that used to precede the loop
# were dropped: their results were overwritten before ever being read.)
page_reviews <- vector("list", length(page_ids))
for (i in page_ids) {
  print(i)  # progress indicator
  url <- paste0(base_url, 'page/', i, '/')
  r <- GET(url)
  h <- read_html(r)
  comment_area <- html_nodes(h, '.tc_mobile')
  comments <- html_nodes(comment_area, '.text_content')
  page_reviews[[i]] <- html_text(comments)
}
# Flatten the per-page character vectors into one vector of review texts.
all.reviews <- unlist(page_reviews)
# Build the corpus and normalise the text: lower-case, collapse whitespace,
# strip punctuation and digits, then drop English stop words.
cps <- Corpus(VectorSource(all.reviews))
cps <- tm_map(cps, content_transformer(tolower))
cps <- tm_map(cps, content_transformer(stripWhitespace))
cps <- tm_map(cps, content_transformer(removePunctuation))
cps <- tm_map(cps, content_transformer(removeNumbers))
cps <- tm_map(cps, removeWords, stopwords("english"))
# Terms in rows, documents in columns, weighted with weightTf;
# wordLengths bounds the accepted term length.
tdm <- TermDocumentMatrix(cps, control=list(
wordLengths=c(3, 20),
weighting=weightTf))
# Row indices of every term whose name matches the regex 'cabin|staff'
# (a substring match, so any longer term containing either word also matches).
rows.cabin = grep('cabin|staff', row.names(tdm))
rows.cabin
# [1] 235 1594
# Collapse the selected term rows along dimension 1 and densify the result;
# the printed output shows a single row with one value per document.
# NOTE(review): no FUN is passed to rollup(), so this relies on its default
# aggregation (presumably sum) - confirm.
count.cabin = as.array(rollup(tdm[rows.cabin,], 1))
count.cabin
#Docs
#Terms 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
#1 0 1 1 0 0 2 2 0 0 1 1 0 4 0 1 0 1 0 2 1 0 0 1 3 1 4 2 0 3 0 1 1 4 0 0 2 1 0 0 2 1 0 2 1 3 3 1
#Docs
#Terms 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
#1 0 1 0 1 2 3 2 2 1 1 0 2 0 0 0 0 0 2 0 1 0 0 4 0 2 2 1 3 1 1 1 1 0 0 0 5 3 0 2 1 0 1 0 0
#Docs
#Terms 92 93 94 95 96 97 98 99 100
#1 1 5 2 1 0 0 0 1 0
# Row index of the term(s) matching 'crews'; this row is reused as the
# container for the combined counts.
row.crews = grep('crews', row.names(tdm))
row.crews
#[1] 408
# Overwrite the 'crews' row with the combined cabin/staff counts.
# NOTE(review): the conversion further down fails after this assignment;
# presumably `[<-` returns a plain sparse matrix that no longer carries the
# TermDocumentMatrix "weighting" attribute - confirm.
tdm[row.crews,] = count.cabin
# Keep the target row out of the set of rows to delete, then drop the
# now-redundant cabin/staff rows.
rows.cabin = setdiff(rows.cabin, row.crews) # ok
tdm = tdm[-rows.cabin,] # ok
# This is the call that raises the error quoted below.
dtm = as.DocumentTermMatrix(tdm)
# Error in .TermDocumentMatrix(t(x), weighting) :
# argument "weighting" is missing, with no default
Maybe this is not the right approach to combining rows in a TermDocumentMatrix.
Please fix this code or suggest a better approach to solve this problem.
Thanks in advance.

Hmm I wonder why you stick to your approach, which obviously does not work, instead of just copying+pasting+adjusting* my suggestion from here?
library(tm)
library(httr)
library(rvest)
library(slam)
# [...] # your code
inspect(tdm[grep("cabin|staff|crew", Terms(tdm), ignore.case=TRUE), 1:15])
# Docs
# Terms 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
# cabin 0 0 0 0 0 1 1 0 0 1 0 0 3 0 0
# crew 0 0 0 1 1 1 1 0 2 1 0 1 0 2 0
# crews 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# staff 0 1 1 0 0 1 1 0 0 0 1 0 1 0 1
# Current term labels; matching terms are relabelled below so that
# rollup() can merge them.
terms <- Terms(tdm)
# Map each replacement label to the existing terms it should absorb.
dict <- list(
  "CREW" = grep("cabin|staff|crew", terms, ignore.case = TRUE, value = TRUE)
)
# Replace every term listed under a label with the label itself.
for (label in names(dict)) {
  absorbed <- terms %in% dict[[label]]
  terms[absorbed] <- label
}
# Sum the rows that now share a label into a single row per label.
tdm <- slam::rollup(tdm, 1, terms, sum)
inspect(tdm[grep("cabin|staff|crew", Terms(tdm), ignore.case=TRUE), 1:15])
# Docs
# Terms 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
# CREW 0 1 1 1 1 3 3 0 2 2 1 1 4 2 1
*I only adjusted the line inside the dict definition...

Related

R: table frequencies of letters in string based on Alphabet

I need to compute letter frequencies for a large list of words. For each position in the word (first, second, ...), I need to find how many times each letter (a-z) appears in the list and then tabulate the data by position within the word.
For example, if my word list is: words <- c("swims", "seems", "gills", "draws", "which", "water")
then the result table should look like this:
letter
first position
second position
third position
fourth position
fifth position
a
0
1
1
0
0
b
0
0
0
0
0
c
0
0
0
1
0
d
1
0
0
0
0
e
0
1
1
1
0
f
0
0
0
0
0
...continued until z
...
...
...
...
...
All words are of same length (5).
What I have so far is:
# Reference alphabet, used as the factor levels below so that letters which
# never occur at a position are still reported with a count of 0.
alphabet <- letters[1:26]
words.df <- data.frame("Words" = words)
# One helper column per letter position. The column is called "Words"
# (capital W) and must be referenced by that exact name: `words.df$words`
# is NULL because `$` lookup is case-sensitive, which makes mutate() fail.
words.df <- words.df %>% mutate("First_place" = substr(Words, 1, 1))
words.df <- words.df %>% mutate("Second_place" = substr(Words, 2, 2))
words.df <- words.df %>% mutate("Third_place" = substr(Words, 3, 3))
words.df <- words.df %>% mutate("Fourth_place" = substr(Words, 4, 4))
words.df <- words.df %>% mutate("Fifth_place" = substr(Words, 5, 5))
# One frequency table (a-z) per position.
x1 <- table(factor(words.df$First_place, alphabet))
x2 <- table(factor(words.df$Second_place, alphabet))
x3 <- table(factor(words.df$Third_place, alphabet))
x4 <- table(factor(words.df$Fourth_place, alphabet))
x5 <- table(factor(words.df$Fifth_place, alphabet))
My code is not efficient and gives a separate table for each letter position. All help will be appreciated.
in base R use table:
table(let = unlist(strsplit(words,'')),pos = sequence(nchar(words)))
pos
let 1 2 3 4 5
a 0 1 1 0 0
c 0 0 0 1 0
d 1 0 0 0 0
e 0 1 1 1 0
g 1 0 0 0 0
h 0 1 0 0 1
i 0 1 2 0 0
l 0 0 1 1 0
m 0 0 0 2 0
r 0 1 0 0 1
s 2 0 0 0 4
t 0 0 1 0 0
w 2 1 0 1 0
Note that if you need all the values from a-z then use
table(factor(unlist(strsplit(words,'')), letters), sequence(nchar(words)))
Also to get a dataframe you could do:
# Cross-tabulate letters (rows, with levels fixed to a-z so unused letters
# still appear) against the position of each character within its word (columns).
d <- table(factor(unlist(strsplit(words,'')), letters), sequence(nchar(words)))
# Convert the table to a data frame and prepend the row names as a
# "letters" column.
cbind(letters = rownames(d), as.data.frame.matrix(d))
Here is a tidyverse solution using dplyr, purrr, and tidyr:
# Split every word into characters, name each character by its position,
# reshape to long (pos, letter) pairs, count the pairs, and finally spread
# the positions back out into one count column per position.
strsplit(words.df$Words, "") %>%
  map_dfr(~setNames(.x, seq_along(.x))) %>%
  pivot_longer(everything(),
               # TRUE rather than T: T is an ordinary variable that can be
               # reassigned, TRUE is a reserved word.
               values_drop_na = TRUE,
               names_to = "pos",
               values_to = "letter") %>%
  count(pos, letter) %>%
  pivot_wider(names_from = pos,
              names_glue = "pos{pos}",
              id_cols = letter,
              values_from = n,
              # letter/position combinations that never occur get a count of 0
              values_fill = 0L)
Output
letter pos1 pos2 pos3 pos4 pos5 pos6 pos7 pos8 pos9 pos10 pos11
1 a 65 127 88 38 28 17 14 5 3 0 0
2 b 58 4 7 9 2 4 2 0 1 0 0
3 c 83 14 45 37 20 19 8 3 2 0 0
4 C 2 0 0 0 0 0 0 0 0 0 0
5 d 43 8 33 47 21 22 9 3 1 1 0
6 e 45 156 81 132 114 69 48 23 14 2 2
7 f 54 11 18 10 5 2 1 0 0 0 0
8 g 23 7 27 21 15 8 7 1 0 0 0
9 h 38 56 6 28 21 10 3 3 1 1 0
10 i 25 106 51 58 38 28 8 4 1 0 0
11 j 6 0 2 2 0 0 0 0 0 0 0
12 k 9 1 6 22 12 0 0 0 0 0 0
13 l 45 41 54 54 36 9 7 6 0 2 0
14 m 45 8 31 19 8 8 4 2 0 0 0
15 n 23 42 75 53 34 41 16 16 4 2 0
16 o 28 167 76 41 38 9 11 2 1 0 0
17 p 72 20 34 30 8 3 1 1 1 0 0
18 q 7 2 1 0 0 0 0 0 0 0 0
19 r 46 74 92 59 56 45 12 9 1 1 0
20 s 119 8 67 35 31 22 18 4 1 0 0
21 t 65 30 73 83 57 42 31 9 6 3 1
22 u 12 66 39 36 20 7 7 2 0 0 0
23 v 8 7 20 12 5 5 1 0 0 0 0
24 w 53 8 13 10 2 3 0 1 0 0 0
25 y 6 4 16 15 17 15 10 5 6 1 1
26 x 0 12 5 0 0 0 0 0 0 0 0
27 z 0 0 1 0 0 0 1 1 0 0 0

Creating a new column in R

I have a data.frame like the following:
regions admit men_age group
1 1234 34 2
2 3416 51 1
3 2463 26 3
4 1762 29 2
5 2784 31 4
6 999 42 1
7 2111 23 2
8 1665 36 3
9 2341 21 4
10 1723 33 1
I would like to create new columns using admit and group as follows:
regions admit men_age group admit1 admit2 admit3 admit4
1 1234 34 2 0 1234 0 0
2 3416 51 1 3416 0 0 0
3 2463 26 3 0 0 2463 0
4 1762 29 2 0 1762 0 0
5 2784 31 4 0 0 0 2784
6 999 42 1 999 0 0 0
7 2111 23 2 0 2111 0 0
8 1665 36 3 0 0 1665 0
9 2341 21 4 0 0 0 2341
10 1723 33 1 1723 0 0 0
In fact, what I want to do is to create four new admit columns according to the group column, as follows: in the admit1 column, for rows where group is 1, put the corresponding admit number; otherwise put zero. In the admit2 column, for rows where group is 2, put the corresponding admit number; otherwise put zero. The same applies to the other two columns as well.
I tried a couple of ways to solve it, but failed.
Could someone please help me to solve this?
A solution using tidyverse. We can create the columns and then spread them with fill = 0.
library(tidyverse)
# Build a label column ("admit" + group number) and a copy of admit, then
# spread the copy into one column per label; cells with no value are filled
# with 0.
dat2 <- dat %>%
mutate(group2 = str_c("admit", group), admit2 = admit) %>%
spread(group2, admit2, fill = 0)
dat2
# regions admit men_age group admit1 admit2 admit3 admit4
# 1 1 1234 34 2 0 1234 0 0
# 2 2 3416 51 1 3416 0 0 0
# 3 3 2463 26 3 0 0 2463 0
# 4 4 1762 29 2 0 1762 0 0
# 5 5 2784 31 4 0 0 0 2784
# 6 6 999 42 1 999 0 0 0
# 7 7 2111 23 2 0 2111 0 0
# 8 8 1665 36 3 0 0 1665 0
# 9 9 2341 21 4 0 0 0 2341
# 10 10 1723 33 1 1723 0 0 0
DATA
dat <- read.table(text = "regions admit men_age group
1 1234 34 2
2 3416 51 1
3 2463 26 3
4 1762 29 2
5 2784 31 4
6 999 42 1
7 2111 23 2
8 1665 36 3
9 2341 21 4
10 1723 33 1",
header = TRUE)
A base R solution would be to use ifelse(). Supposing your data.frame is x, you could do this:
# create the columns with the selected values: for each group number i,
# column i + 4 holds admit where group == i and 0 elsewhere
# (writing to positions 5..8 assumes x starts with exactly four columns)
for( i in 1:4 ) x[ i + 4 ] <- ifelse( x$group == i, x$admit, 0 )
# rename the columns to your liking
colnames( x )[ 5:8 ] <- c( "admit1", "admit2", "admit3", "admit4" )
This gives you
> x
regions admit men_age group admit1 admit2 admit3 admit4
1 1 1234 34 2 0 1234 0 0
2 2 3416 51 1 3416 0 0 0
3 3 2463 26 3 0 0 2463 0
4 4 1762 29 2 0 1762 0 0
5 5 2784 31 4 0 0 0 2784
6 6 999 42 1 999 0 0 0
7 7 2111 23 2 0 2111 0 0
8 8 1665 36 3 0 0 1665 0
9 9 2341 21 4 0 0 0 2341
10 10 1723 33 1 1723 0 0 0
If you don't like the explicit naming, you could do it in the for() loop already:
# Create one column per group number, named "admit<i>": it holds admit for
# the rows whose group equals i and 0 for every other row.
for (i in 1:4) {
  # paste0() is the idiomatic way to concatenate without a separator
  adm <- paste0("admit", i)
  x[adm] <- ifelse(x$group == i, x$admit, 0)
}

How to save for loop results in data frame using cbind

I have a data frame dfSub with a number of parameters inside. This is hourly based data for energy use. I need to sort data by each hour, e.g. for each hour get all values of energy from data frame. As a result I expect to have data frame with 24 columns for each hour, rows are filled with energy values.
The hour is specified as 1:24 and in data frame is linked as dfSub$hr.
The heat is dfSub$heat
I constructed a for-loop and tried to save the results with cbind, but it does not work; the error message is about differing numbers of rows and columns.
I can print the results and see them on screen, but I can't save them as d (a data frame).
here is the code:
# Accumulator for the hourly columns; starts empty.
d = NULL
for (i in 1:24) {
# heat values recorded for hour i
subh= subset(dfSub$heat, dfSub$hr == i)
print(subh)
# Append this hour's values as a new column.
# NOTE(review): cbind() on data frames needs the same number of rows in every
# piece, so this presumably fails as soon as two hours have a different
# number of heat values (the size error described in the question) - confirm.
d = cbind(d, as.data.frame(subh))
}
The append function is not applicable, since I don't know in advance how many heat values there are for each hour.
Any help is appreciated.
Part of dfSub
hr wk month dyid wend t heat
1 2 1 1 0 -9.00 81
2 2 1 1 0 -8.30 61
3 2 1 1 0 -7.80 53
4 2 1 1 0 -7.00 51
5 2 1 1 0 -7.00 30
6 2 1 1 0 -6.90 31
7 2 1 1 0 -7.10 51
8 2 1 1 0 -6.50 90
9 2 1 1 0 -8.90 114
10 2 1 1 0 -9.90 110
11 2 1 1 0 -11.70 126
12 2 1 1 0 -9.70 113
13 2 1 1 0 -11.60 104
14 2 1 1 0 -10.00 107
15 2 1 1 0 -10.20 117
16 2 1 1 0 -9.00 90
17 2 1 1 0 -8.00 114
18 2 1 1 0 -7.80 83
19 2 1 1 0 -8.10 82
20 2 1 1 0 -8.20 61
21 2 1 1 0 -8.80 34
22 2 1 1 0 -9.10 52
23 2 1 1 0 -10.10 41
24 2 1 1 0 -8.80 52
1 2 1 2 0 -8.70 44
2 2 1 2 0 -8.40 50
3 2 1 2 0 -8.10 33
4 2 1 2 0 -7.70 41
5 2 1 2 0 -7.80 33
6 2 1 2 0 -7.50 43
7 2 1 2 0 -7.30 40
8 2 1 2 0 -7.10 8
The output expected as:
hr1 hr2 hr3 hr4..... hr24
81 61 53 51 ..... 52
44 50 33 41
One can avoid use of for-loop in this case. An option is to use tidyr::spread to convert your hourly data in wide format.
library(tidyverse)
# Drop the t and wend columns, turn the hour number into a zero-padded label
# ("hr01" ... "hr24"), then spread heat into one column per hour label.
df %>% select(-t, -wend) %>%
mutate(hr = sprintf("hr%02d",hr)) %>%
spread(hr, heat)
Result:
# wk month dyid hr01 hr02 hr03 hr04 hr05 hr06 hr07 hr08 hr09 hr10 hr11 hr12 hr13 hr14 hr15 hr16 hr17 hr18 hr19 hr20 hr21 hr22 hr23 hr24
# 1 2 1 1 81 61 53 51 30 31 51 90 114 110 126 113 104 107 117 90 114 83 82 61 34 52 41 52
# 2 2 1 2 44 50 33 41 33 43 40 8 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
Data:
df <- read.table(text =
"hr wk month dyid wend t heat
1 2 1 1 0 -9.00 81
2 2 1 1 0 -8.30 61
3 2 1 1 0 -7.80 53
4 2 1 1 0 -7.00 51
5 2 1 1 0 -7.00 30
6 2 1 1 0 -6.90 31
7 2 1 1 0 -7.10 51
8 2 1 1 0 -6.50 90
9 2 1 1 0 -8.90 114
10 2 1 1 0 -9.90 110
11 2 1 1 0 -11.70 126
12 2 1 1 0 -9.70 113
13 2 1 1 0 -11.60 104
14 2 1 1 0 -10.00 107
15 2 1 1 0 -10.20 117
16 2 1 1 0 -9.00 90
17 2 1 1 0 -8.00 114
18 2 1 1 0 -7.80 83
19 2 1 1 0 -8.10 82
20 2 1 1 0 -8.20 61
21 2 1 1 0 -8.80 34
22 2 1 1 0 -9.10 52
23 2 1 1 0 -10.10 41
24 2 1 1 0 -8.80 52
1 2 1 2 0 -8.70 44
2 2 1 2 0 -8.40 50
3 2 1 2 0 -8.10 33
4 2 1 2 0 -7.70 41
5 2 1 2 0 -7.80 33
6 2 1 2 0 -7.50 43
7 2 1 2 0 -7.30 40
8 2 1 2 0 -7.10 8",
header = TRUE, stringsAsFactors = FALSE)
With tidyr:
> df<-read.fwf(textConnection(
+ "hr,wk,month,dyid,wend,t,heat
+ 1 2 1 1 0 -9.00 81
+ 2 2 1 1 0 -8.30 61
+ 3 2 1 1 0 -7.80 53
+ 4 2 1 1 0 -7.00 51
+ 5 2 1 1 0 -7.00 30
+ 6 2 1 1 0 -6.90 31
+ 7 2 1 1 0 -7.10 51
+ 8 2 1 1 0 -6.50 90
+ 9 2 1 1 0 -8.90 114
+ 10 2 1 1 0 -9.90 110
+ 11 2 1 1 0 -11.70 126
+ 12 2 1 1 0 -9.70 113
+ 13 2 1 1 0 -11.60 104
+ 14 2 1 1 0 -10.00 107
+ 15 2 1 1 0 -10.20 117
+ 16 2 1 1 0 -9.00 90
+ 17 2 1 1 0 -8.00 114
+ 18 2 1 1 0 -7.80 83
+ 19 2 1 1 0 -8.10 82
+ 20 2 1 1 0 -8.20 61
+ 21 2 1 1 0 -8.80 34
+ 22 2 1 1 0 -9.10 52
+ 23 2 1 1 0 -10.10 41
+ 24 2 1 1 0 -8.80 52
+ 1 2 1 2 0 -8.70 44
+ 2 2 1 2 0 -8.40 50
+ 3 2 1 2 0 -8.10 33
+ 4 2 1 2 0 -7.70 41
+ 5 2 1 2 0 -7.80 33
+ 6 2 1 2 0 -7.50 43
+ 7 2 1 2 0 -7.30 40
+ 8 2 1 2 0 -7.10 8"
+ ),header=TRUE,sep=",",widths=c(5,3,6,5,5,7,5))
>
> library(tidyr)
> # Keep only the day id, hour and heat columns. Base subsetting is used
> # because select() is a dplyr verb and only tidyr is attached here.
> df1 <- df[, c("dyid", "hr", "heat")]
> # One row per day, one column per hour.
> df2 <- spread(df1,hr,heat)
> # Prefix the hour columns (everything after dyid) with "hr".
> colnames(df2)[2:ncol(df2)] <- paste0("hr",colnames(df2)[2:ncol(df2)])
> df2
dyid hr1 hr2 hr3 hr4 hr5 hr6 hr7 hr8 hr9 hr10 hr11 hr12 hr13 hr14 hr15 hr16 hr17 hr18 hr19 hr20 hr21 hr22 hr23 hr24
1 1 81 61 53 51 30 31 51 90 114 110 126 113 104 107 117 90 114 83 82 61 34 52 41 52
2 2 44 50 33 41 33 43 40 8 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
>
I found a solution that helped me to solve my task here: Append data frames together in a for loop
It uses an empty list and combines its elements into a data frame afterwards:
# Gather the heat readings of each hour of the day in a list (one element
# per hour), then bind the elements together row-wise.
datalist <- vector("list", 24)
for (i in seq_len(24)) {
  subh <- subset(dfSub$heat, dfSub$hr == i)
  datalist[[i]] <- subh
}
big_data <- do.call(rbind, datalist)
Both cbind and rbind work.
Thanks everyone for help :)

Create one column out of several columns Rstudio or Excel

I've tried to find an existing question on this matter but couldn't, which is why I'm asking here:
Summary:
I want to make ONE column out of several Columns. All the values in the columns are put in the same order as they are and also, the columns should be stacked below each other.
Description and details
Below is an example of what my CSV file looks like. However, note that there are >400 columns, which is why I don't want to do it manually in, for example, Excel. ALL columns have 24 rows each.
X1 X2 X3 X4 X5 X6 ... X470
0 1 5 10 8 0 7
0 0 0 0 0 0 0
2 3 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0
I want to "stack" all the columns in one column, as I've shortly described in the summary:
Info: The sign "..." below implies the rest of the values from that column.
VALUE FROM COLUMN
0 X1
0 X1
2 X1
...
1 X2
0 X2
3 X2
...
5 X3
...
10 X4
...
8 X5
...
0 X6
...
7 X470
...
So in the end, instead of having 486 columns with 24 rows each, I will have 1 column with 11664 rows. It would be good if the column of origin were written in a new column on the side (as shown above), but this is not required.
OBS! Note that I've with this df just showed in general what I want to achieve, so clear and understandable commands are appreciated as I will apply it to my df.
It doesn't matter if the solution is done in R or Excel! As long as it is easy to do
I hope my description is clear, otherwise please let me know so I can try to describe again.
Many thanks for suggestions and help.
Kind regards, Elin
We can use stack to get the values in one column and the colnames in the next.
stack(df)
Or use unlist
data.frame(VALUE=unlist(df),
fromColumn= rep(names(df), each=nrow(df)))
Here's a VBA user defined function to do the job:
' Stacks the columns of a range (or 2-D array) on top of each other.
'   Colrange: a worksheet Range or a 2-D Variant array.
'   Returns:  a (rows * columns) x 1 Variant array holding column 1's values,
'             then column 2's values, and so on, each in original row order.
Function ConcatCols(Colrange As Variant) As Variant
    Dim LongCol() As Variant, i As Long, j As Long, k As Long
    Dim NumCols As Long, NumRows As Long, NumRows2 As Long
    ' Work on the cell values rather than on the Range object itself.
    If TypeName(Colrange) = "Range" Then Colrange = Colrange.Value2
    ' A single cell (or any scalar) yields a plain value, not an array, and
    ' UBound would fail on it: return it as a 1 x 1 column instead.
    If Not IsArray(Colrange) Then
        ReDim LongCol(1 To 1, 1 To 1)
        LongCol(1, 1) = Colrange
        ConcatCols = LongCol
        Exit Function
    End If
    NumRows = UBound(Colrange)
    NumCols = UBound(Colrange, 2)
    NumRows2 = NumRows * NumCols
    ReDim LongCol(1 To NumRows2, 1 To 1)
    ' k is the next free row of the output column.
    k = 1
    ' Column-major walk: finish one source column before starting the next.
    For i = 1 To NumCols
        For j = 1 To NumRows
            LongCol(k, 1) = Colrange(j, i)
            k = k + 1
        Next j
    Next i
    ConcatCols = LongCol
End Function
Enter the code in a VBA module then enter =ConcatCols(A1:RL24) as an array function (Ctrl-Shift-Enter) in column RM (or wherever you want) to view the entire concatenated column, or call from a VBA sub to write the data to the spreadsheet.
The following is pretty simple but requires the reshape2 package, which is not part of base R and has to be installed from CRAN. As suggested above, stack() gives similar output, but with the two columns in the opposite order.
library(reshape2)
df <- data.frame("A" = 1:21, "B" = 21:41, "C" = 40:60)
> df
A B C
1 1 21 40
2 2 22 41
3 3 23 42
4 4 24 43
5 5 25 44
6 6 26 45
7 7 27 46
8 8 28 47
9 9 29 48
10 10 30 49
11 11 31 50
12 12 32 51
13 13 33 52
14 14 34 53
15 15 35 54
16 16 36 55
17 17 37 56
18 18 38 57
19 19 39 58
20 20 40 59
21 21 41 60
melt(df)
> melt(df)
No id variables; using all as measure variables
variable value
1 A 1
2 A 2
3 A 3
4 A 4
5 A 5
6 A 6
7 A 7
8 A 8
9 A 9
10 A 10
11 A 11
12 A 12
13 A 13
14 A 14
15 A 15
16 A 16
17 A 17
18 A 18
19 A 19
20 A 20
21 A 21
22 B 21
23 B 22
24 B 23
25 B 24
26 B 25
27 B 26
28 B 27
29 B 28
30 B 29
31 B 30
32 B 31
33 B 32
34 B 33
35 B 34
36 B 35
37 B 36
38 B 37
39 B 38
40 B 39
41 B 40
42 B 41
43 C 40
44 C 41
45 C 42
46 C 43
47 C 44
48 C 45
49 C 46
50 C 47
51 C 48
52 C 49
53 C 50
54 C 51
55 C 52
56 C 53
57 C 54
58 C 55
59 C 56
60 C 57
61 C 58
62 C 59
63 C 60

Plotting nonlinear decision boundary

I'm trying to plot a nonlinear decision boundary, that is supposed to look like this:
I have fitted a regularized nonlinear logistic regression of the form:
This is an extract of my data:
ones test1 test2 use
1 1 0.051267 0.69956 1
2 1 -0.092742 0.68494 1
3 1 -0.213710 0.69225 1
4 1 -0.375000 0.50219 1
5 1 -0.513250 0.46564 1
6 1 -0.524770 0.20980 1
These are the parameters I have calculated using the optim() function:
[1] 0.377980476 -0.085951551 0.445140731
[4] -1.953080687 -0.506554404 -0.330330236
[7] 0.414649938 0.270281786 0.183804530
[10] -0.155359467 -0.753665545 0.351880543
[13] 0.238052214 0.619714119 -0.582420943
[16] 0.150625144 0.266319363 -0.331130949
[19] 0.177759335 -0.005402135 -0.124253913
[22] 0.085607070 0.580258782 0.973785263
[25] 0.387313615 0.237754576 -0.011198804
[28] -0.514447404
I'm still new to R, and I don't really have any idea on how to tackle this problem, can anybody help me out please?
ones test1 test2 use
1 1 0.0512670 0.699560 1
2 1 -0.0927420 0.684940 1
3 1 -0.2137100 0.692250 1
4 1 -0.3750000 0.502190 1
5 1 -0.5132500 0.465640 1
6 1 -0.5247700 0.209800 1
7 1 -0.3980400 0.034357 1
8 1 -0.3058800 -0.192250 1
9 1 0.0167050 -0.404240 1
10 1 0.1319100 -0.513890 1
11 1 0.3853700 -0.565060 1
12 1 0.5293800 -0.521200 1
13 1 0.6388200 -0.243420 1
14 1 0.7367500 -0.184940 1
15 1 0.5466600 0.487570 1
16 1 0.3220000 0.582600 1
17 1 0.1664700 0.538740 1
18 1 -0.0466590 0.816520 1
19 1 -0.1733900 0.699560 1
20 1 -0.4786900 0.633770 1
21 1 -0.6054100 0.597220 1
22 1 -0.6284600 0.334060 1
23 1 -0.5938900 0.005117 1
24 1 -0.4210800 -0.272660 1
25 1 -0.1157800 -0.396930 1
26 1 0.2010400 -0.601610 1
27 1 0.4660100 -0.535820 1
28 1 0.6733900 -0.535820 1
29 1 -0.1388200 0.546050 1
30 1 -0.2943500 0.779970 1
31 1 -0.2655500 0.962720 1
32 1 -0.1618700 0.801900 1
33 1 -0.1733900 0.648390 1
34 1 -0.2828300 0.472950 1
35 1 -0.3634800 0.312130 1
36 1 -0.3001200 0.027047 1
37 1 -0.2367500 -0.214180 1
38 1 -0.0639400 -0.184940 1
39 1 0.0627880 -0.163010 1
40 1 0.2298400 -0.411550 1
41 1 0.2932000 -0.228800 1
42 1 0.4832900 -0.184940 1
43 1 0.6445900 -0.141080 1
44 1 0.4602500 0.012427 1
45 1 0.6273000 0.158630 1
46 1 0.5754600 0.268270 1
47 1 0.7252300 0.443710 1
48 1 0.2240800 0.524120 1
49 1 0.4429700 0.670320 1
50 1 0.3220000 0.692250 1
51 1 0.1376700 0.575290 1
52 1 -0.0063364 0.399850 1
53 1 -0.0927420 0.553360 1
54 1 -0.2079500 0.355990 1
55 1 -0.2079500 0.173250 1
56 1 -0.4383600 0.217110 1
57 1 -0.2194700 -0.016813 1
58 1 -0.1388200 -0.272660 1
59 1 0.1837600 0.933480 0
60 1 0.2240800 0.779970 0
61 1 0.2989600 0.619150 0
62 1 0.5063400 0.758040 0
63 1 0.6157800 0.728800 0
64 1 0.6042600 0.597220 0
65 1 0.7655500 0.502190 0
66 1 0.9268400 0.363300 0
67 1 0.8231600 0.275580 0
68 1 0.9614100 0.085526 0
69 1 0.9383600 0.012427 0
70 1 0.8634800 -0.082602 0
71 1 0.8980400 -0.206870 0
72 1 0.8519600 -0.367690 0
73 1 0.8289200 -0.521200 0
74 1 0.7943500 -0.557750 0
75 1 0.5927400 -0.740500 0
76 1 0.5178600 -0.594300 0
77 1 0.4660100 -0.418860 0
78 1 0.3508100 -0.579680 0
79 1 0.2874400 -0.769740 0
80 1 0.0858290 -0.755120 0
81 1 0.1491900 -0.579680 0
82 1 -0.1330600 -0.448100 0
83 1 -0.4095600 -0.411550 0
84 1 -0.3922800 -0.258040 0
85 1 -0.7436600 -0.258040 0
86 1 -0.6975800 0.041667 0
87 1 -0.7551800 0.290200 0
88 1 -0.6975800 0.684940 0
89 1 -0.4038000 0.706870 0
90 1 -0.3807600 0.918860 0
91 1 -0.5074900 0.904240 0
92 1 -0.5478100 0.706870 0
93 1 0.1031100 0.779970 0
94 1 0.0570280 0.918860 0
95 1 -0.1042600 0.991960 0
96 1 -0.0812210 1.108900 0
97 1 0.2874400 1.087000 0
98 1 0.3968900 0.823830 0
99 1 0.6388200 0.889620 0
100 1 0.8231600 0.663010 0
101 1 0.6733900 0.641080 0
102 1 1.0709000 0.100150 0
103 1 -0.0466590 -0.579680 0
104 1 -0.2367500 -0.638160 0
105 1 -0.1503500 -0.367690 0
106 1 -0.4902100 -0.301900 0
107 1 -0.4671700 -0.133770 0
108 1 -0.2885900 -0.060673 0
109 1 -0.6111800 -0.067982 0
110 1 -0.6630200 -0.214180 0
111 1 -0.5996500 -0.418860 0
112 1 -0.7263800 -0.082602 0
113 1 -0.8300700 0.312130 0
114 1 -0.7206200 0.538740 0
115 1 -0.5938900 0.494880 0
116 1 -0.4844500 0.999270 0
117 1 -0.0063364 0.999270 0
118 1 0.6326500 -0.030612 0
Though not an ideal answer, you can use an SVM model to visualize this (it gives ~0.83 in-sample error):
# library() stops immediately if e1071 is missing; require() would only
# warn and let the svm() call below fail with a less obvious error.
library(e1071)
# Keep the response and the two predictors only.
data <- data[, c("use", "test1", "test2")]
# Fit an SVM of use on both test scores and plot the fitted model.
fit <- svm(use ~ ., data = data)
plot(fit, data = data)
Using a simple transformation we can try to get a linearly separable dataset:
# Squared features: the class label as a factor plus test1^2 and test2^2.
data2 = data.frame(
y = factor(data[, "use"]),
x1 = data[, "test1"]^2,
x2 = data[, "test2"]^2 )
# NOTE(review): nothing below obviously comes from MASS; this load may be
# unnecessary - confirm.
require(MASS)
# Logistic regression on the squared features. With the formula y ~ x2 + x1
# the coefficients are presumably ordered (intercept, x2, x1), which the
# abline() call below relies on.
fit = glm(y ~ x2 + x1, data = data2, family = binomial(link = "logit"))
# Scatter plot in the squared space, fill colour chosen by class.
plot(x2 ~ x1, data = data2, bg = as.numeric(y) + 1, pch = 21, main = "Logistic regression on Y ~ X1 + X2")
# Decision boundary b0 + b_x2 * x2 + b_x1 * x1 = 0 solved for x2:
# intercept = -b0 / b_x2, slope = -b_x1 / b_x2.
abline(-fit$coefficients[1]/fit$coefficients[2], -fit$coefficients[3]/fit$coefficients[2], col = 'blue', lwd = 2)
Which gives you this (~ 0.73 in-sample error):
So now you got
Y = w0 + w1 * test1^2 + w2 * test2^2
Which you can use to isolate test2 = f(test1) and plot the non-linear boundary.

Resources