KNN for text classification, but train and class have different lengths in R - r

Hello, I am trying to classify text. Here is the code:
# Load the pre-processed data set (the path is specific to the author's machine).
df <- read.csv("D:/AS/tokpedprepro.csv")
# Sampling: fix the RNG seed, then shuffle the rows of df.
set.seed(123)
df <- df[sample(nrow(df)),]
# NOTE(review): the rows are shuffled a second time here; a single shuffle
# looks sufficient -- confirm this line is not an accidental duplicate.
df <- df[sample(nrow(df)),]
# Convert the text column to a corpus
dfCorpus <- Corpus(VectorSource(df$text))
inspect(dfCorpus[1:20])
# Convert the corpus to a document-term matrix (DTM)
dtm <- DocumentTermMatrix(dfCorpus)
inspect(dtm[1:4, 3:7])
# Data partition: documents 1-20 are used for training, 21-37 for testing
df.train <- df[1:20,]
df.test <- df[21:37,]
dtm.train <- dtm[1:20,]
dtm.test <- dtm[21:37,]
df.Corpus.train <- dfCorpus[1:20]
df.corpus.test <- dfCorpus[21:37]
# NOTE(review): this takes the label of every row of df, not only the 20
# training rows selected above, so train.class is longer than the training set.
train.class <- df$data.class
# TF-IDF weighted DTM built from the training corpus only
dtm.train.knn <- DocumentTermMatrix(df.Corpus.train, control = list(weighting =
function(x) weightTfIdf(x, normalize = FALSE)))
dim(dtm.train.knn)
The dimension is
[1] 20 194
# TF-IDF weighted DTM built from the test corpus only.
# NOTE(review): the vocabulary here is derived independently of the training
# DTM, which is why the two matrices report different numbers of columns.
dtm.test.knn <- DocumentTermMatrix(df.corpus.test, control = list(weighting =
function(x) weightTfIdf(x, normalize = FALSE)))
dim(dtm.test.knn)
the dimension is
[1] 17 211
Then
# 1-nearest-neighbour prediction for the test documents.
# NOTE(review): train.class has to supply exactly one label per row of
# dtm.train.knn; the train and test matrices presumably also need identical
# columns -- verify against the knn() documentation.
knn.pred <- knn(dtm.train.knn, dtm.test.knn, train.class, k=1 )
But I get this error:
'train' and 'class' have different lengths
What should I do?
Thanks

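Your train.class is built from the whole data frame (train.class <- df$data.class), but your dtm.train.knn is based only on dfCorpus[1:20]. The class vector needs one label per training document, so restrict it to the same rows, e.g. train.class <- df$data.class[1:20]. Note also that your training and test matrices have different numbers of columns (194 vs 211); knn() needs both matrices to share the same terms, so build the test matrix with the training vocabulary.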

Related

Convert multiple list to nested list of given structure using function [R]

I have a set of lists which I would like to convert into the nested list of a certain structure. My initial data look like list_1_1 ... list_2_2. I would like them to be like final_desired_output.
I can do this step by step by extracting the desired variables and appending them to the output list one by one. However, this dummy example contains only 2 data subsets (first_lists and second_lists), while the real-life data are far larger than 1 GB. Thus, I would like to do it with a function, which I unfortunately do not know how to write, as nested lists are not well covered in tutorials. Any assistance?
# some dummy data
# Group "one": three sequences, each with a scalar start and end value
one_1 <- c(1:10)
one_2 <- c(2:15)
one_3 <- c(3:20)
starting_one_1 <- 1
starting_one_2 <- 2
starting_one_3 <- 3
ending_one_1 <- c(11)
ending_one_2 <- c(16)
ending_one_3 <- c(21)
# Group "two": two sequences, each with a scalar start and end value
two_1 <- c(1:100)
two_2 <- c(1:15)
starting_two_1 <- 5
starting_two_2 <- 10
ending_two_1 <- c(101)
ending_two_2 <- c(16)
# lists mimicking output I currently have:
# list_<group>_1 holds the sequences, list_<group>_2 the starting values and
# list_<group>_3 the ending values, all in the same element order
list_1_1 <- list(one_1, one_2, one_3)
list_1_2 <- list(starting_one_1, starting_one_2, starting_one_3)
list_1_3 <- list(ending_one_1, ending_one_2, ending_one_3)
list_2_1 <- list(two_1, two_2)
list_2_2 <- list(starting_two_1, starting_two_2)
list_2_3 <- list(ending_two_1, ending_two_2)
# producing desired output: one record (sequence, starting, ending) per element,
# assembled by hand
list_1_1_desired <- list()
list_1_1_desired[["sequence"]] <- one_1
list_1_1_desired[["starting"]] <- starting_one_1
list_1_1_desired[["ending"]] <- ending_one_1
list_1_2_desired <- list()
list_1_2_desired[["sequence"]] <- one_2
list_1_2_desired[["starting"]] <- starting_one_2
list_1_2_desired[["ending"]] <- ending_one_2
list_1_3_desired <- list()
list_1_3_desired[["sequence"]] <- one_3
list_1_3_desired[["starting"]] <- starting_one_3
list_1_3_desired[["ending"]] <- ending_one_3
list_2_1_desired <- list()
list_2_1_desired[["sequence"]] <- two_1
list_2_1_desired[["starting"]] <- starting_two_1
list_2_1_desired[["ending"]] <- ending_two_1
list_2_2_desired <- list()
list_2_2_desired[["sequence"]] <- two_2
list_2_2_desired[["starting"]] <- starting_two_2
list_2_2_desired[["ending"]] <- ending_two_2
# Records of each group collected into a list named after the source sequences
first_lists <- list(list_1_1_desired, list_1_2_desired, list_1_3_desired)
names(first_lists) <- c("one_1", "one_2", "one_3")
second_lists <- list(list_2_1_desired, list_2_2_desired)
names(second_lists) <- c("two_1", "two_2")
# this is what I would like to obtain: group -> element -> (sequence, starting, ending)
final_desired_output <- list()
final_desired_output[["one"]] <- first_lists
final_desired_output[["two"]] <- second_lists
You could use purrr::transpose:
# purrr supplies map(), transpose() and set_names(), and re-exports the %>%
# pipe; the snippet cannot run without it being attached.
library(purrr)
# Collect every object named list_<group>_<part>, group them by their
# list_<group> prefix, label the three parallel lists of each group and turn
# them inside out so that every element becomes one
# (sequence, starting, ending) record.
out <- mget(ls(pattern = '^list.*\\d$')) %>%
  split(sub("_\\d+$", '', names(.))) %>%
  map(~transpose(set_names(.,c('sequence', 'starting', 'ending'))))
# The element names differ from the hand-built list, so compare values only.
all.equal(out, final_desired_output, check.attributes = FALSE)
[1] TRUE

Still getting Error in split == FALSE : comparison (1) is possible only for atomic and list types

This topic has already been discussed in this feed but I still get the same error after following the suggestions when I try to split my data data_csv with the following code:
# Design matrix (intercept column dropped) and response for the full data set.
x <- model.matrix(RET ~ ., data_csv)[, -1]
y <- data_csv$RET
#install.packages("caTools")
library(caTools)
set.seed(101)
# Logical mask with one entry per row: TRUE marks a training row (80%),
# FALSE a test row.
# NOTE: the name `split` masks base::split(); if this assignment is skipped or
# fails, `split` in the subset() calls below resolves to that function instead.
split <- sample.split(data_csv$RET, SplitRatio = 0.8)
train.set <- subset(data_csv, split == TRUE)
test.set <- subset(data_csv, split == FALSE)
y.train <- train.set$RET
# Fixed: `$` extracts the RET column; the original used `&` (elementwise AND),
# which does not extract anything.
y.test <- test.set$RET
# Predictor matrices: every column except the response.
X.train <- as.matrix(subset(train.set, select = -c(RET)))
X.test <- as.matrix(subset(test.set, select = -c(RET)))

Can't generate word cloud by cluster number using R

I am trying to generate a word cloud for each cluster, but it gives the error "x must be an array of at least two dimensions". I am using Twitter data, with this pipeline: corpus -> text mining -> document-term matrix -> k-means clustering -> word cloud for each cluster.
library(tm)
library(SnowballC)
library(XML)
library(streamR)
library(wordcloud)
library(NLP)
library(fpc)
library(cluster)
# Parse the captured tweets into a data frame and build a corpus from their text.
tweetsDF <- parseTweets('tweetsStream.txt', simplify = FALSE)
names(tweetsDF)
corp = Corpus(VectorSource(tweetsDF$text))
inspect(corp[1:1])
# NOTE(review): this rebuilds the corpus from the corpus object itself rather
# than from the tweet text; it looks unintended -- confirm.
corp = Corpus(VectorSource(corp))
dtm = DocumentTermMatrix(corp)
inspect(dtm)
tdm = TermDocumentMatrix(corp)
# Total count of every term across all documents, then sorted high to low.
freq = colSums(as.matrix(dtm))
length(freq)
freq= sort(colSums(as.matrix(dtm)), decreasing = TRUE)
head(freq, 14)
# Distances are taken between the rows of t(dtm), i.e. between terms, so the
# k-means step below groups terms.
d= dist(t(dtm), method="euclidian")
kfit <- kmeans(d, 2)
clusplot(as.matrix(d), kfit$cluster, color=T, shade=T, labels=2, lines=0)
# NOTE(review): from here on docs1 holds only the names of the cluster-2
# members, not their counts, so there is nothing numeric for colSums() to add
# up; the counts would have to come from dtm/tdm.
docs1 = names(which(kfit$cluster ==2))
docs1 = as.matrix(docs1)
v1= sort(colSums((docs1)), decreasing= TRUE)
error x must be an array of at least two dimension
# Build a word/frequency table from v1 and draw the cloud, keeping only words
# that occur at least twice.
myNames1 = names(v1)
d1 = data.frame(word=myNames1, freq=v1)
wordcloud(d1$word, d1$freq, min.freq=2)
output of dput
You are not collecting the term data after clustering, which you need in order to build the word clouds.
What you want should be something like this:
library(slam)
# Positions of the entries that k-means assigned to cluster 2.
docs1 <- which(kfit$cluster == 2)
head(docs1)
length(docs1)
# Keep only the matching rows of the term-document matrix.
docs1 <- tdm[docs1, ]
head(docs1)
# One row per retained term, with its total count over all documents.
d1 <- data.frame(
  word = rownames(docs1),
  freq = row_sums(docs1)
)
head(d1)
# Sort by ascending frequency before plotting.
d1 <- d1[order(d1[["freq"]]), ]
wordcloud(words = d1$word, freq = d1$freq, min.freq = 2)
Minimal example:
Using some built in data I have done kmeans clustering and generated a wordcloud based on one of the clusters
library(tm)
library(wordcloud)
library(slam)
# clusplot() comes from the cluster package, which the original example never
# loaded.
library(cluster)
data("acq")
dtm <- DocumentTermMatrix(acq)
inspect(dtm)
tdm <- TermDocumentMatrix(acq)
# Total count of every term across all documents (computed once, then sorted).
freq <- colSums(as.matrix(dtm))
length(freq)
freq <- sort(freq, decreasing = TRUE)
head(freq, 14)
# t(dtm) has one row per term, so these are term-to-term distances and the
# clustering below groups terms.
d <- dist(t(dtm), method="euclidian")
# k-means starts from random centres, so which group ends up labelled 2 can
# change between runs.
kfit <- kmeans(d, 2)
clusplot(as.matrix(d), kfit$cluster, color = TRUE, shade = TRUE, labels = 2,
         lines = 0)
# Positions of the cluster-2 members, then the matching rows of the
# term-document matrix.
docs1 <- which(kfit$cluster == 2)
head(docs1); length(docs1)
docs1 <- tdm[docs1, ]
inspect(docs1)
# One row per retained term with its total count over all documents.
d1 <- data.frame(word = rownames(docs1), freq = row_sums(docs1))
head(d1)
d1 <- d1[order(d1$freq), ]
wordcloud(d1$word, d1$freq, min.freq = 2)
As a side note: posting an image of your dput output doesn't help, as we cannot use an image to recreate your data on our machines.

Simpler method to insert dataframe variable and name when creating many dataframes from raster type

Is there a simpler way to designate new dataframe rows and rownames in the creation of a data frame from raster data?
# Read five rasters, one per input file
rastA <- raster("rasterA.txt")
rastB <- raster("rasterB.txt")
rastC <- raster("rasterC.txt")
rastD <- raster("rasterD.txt")
rastE <- raster("rasterE.txt")
# Convert each raster to its own data frame
dfA <- as.data.frame(rastA)
dfB <- as.data.frame(rastB)
dfC <- as.data.frame(rastC)
dfD <- as.data.frame(rastD)
dfE <- as.data.frame(rastE)
# Renaming the first column of every data frame to a common name
names(dfA)[1] <- 'values'
names(dfB)[1] <- 'values'
names(dfC)[1] <- 'values'
names(dfD)[1] <- 'values'
names(dfE)[1] <- 'values'
# Adding new column with classifier 'X'
# NOTE(review): every frame receives the same value 'X', so rows from
# different rasters cannot be told apart by `type` once combined -- confirm
# this is intended.
dfA$type <- 'X'
dfB$type <- 'X'
dfC$type <- 'X'
dfD$type <- 'X'
dfE$type <- 'X'
# Pairwise combinations of A with B, C and D (rows stacked)
df_AB <- rbind.data.frame(dfA, dfB)
df_AC <- rbind.data.frame(dfA, dfC)
df_AD <- rbind.data.frame(dfA, dfD)
The final combined data frames are then fed into ggplot to generate various histogram and density plots. This line-by-line method is easy enough, but I am wondering what efficiencies can be gained by using different methods.
Here is an approach that simplifies part of this
# Path to a sample raster file shipped with the raster package.
f <- system.file("external/test.grd", package = "raster")
# Reuse the same file five times to stand in for five separate rasters.
fls <- rep(f, 5)
# Stack the layers; multiplying by 1:5 presumably scales layer i by i so that
# the layers differ -- confirm against the raster arithmetic documentation.
s <- stack(fls) * 1:5
names(s) <- LETTERS[1:5]
# Convert the stack to a data frame and drop rows with missing values.
df <- na.omit(as.data.frame(s))
I would expect that for most plots, df is what you want to use, and that you would not need to create all the separate objects that you do. However, if that is what you want, perhaps do
# Reshape from wide (one column per layer) to long: a single 'values' column
# plus a 'group' column holding the name of the column each value came from.
# The argument is spelled out as `v.names` instead of relying on partial
# matching of `v.name`.
x <- reshape(df, varying = colnames(df), v.names = "values",
             timevar = "group", times = colnames(df), direction = "long",
             new.row.names = NULL)
# see http://www.ats.ucla.edu/stat/r/faq/reshape.htm
# Drop the row names and the id column that reshape() adds.
rownames(x) <- NULL
x$id <- NULL
x$type <- 'X'
# Any pairwise subset is now a simple filter on `group`.
df_AB <- x[x$group %in% c('A', 'B'), ]
# etc

R caret package (rpart)

I get the below error when using rpart library
dt <- rpart(formula, method="class", data=full.df.allAttr.train);
Error in model.frame.default(formula = formula, data = full.df.allAttr.train, :
object is not a matrix
When I convert full.df.allAttr.train to a matrix:
dt <- rpart(formula, method="class", data= as.matrix( full.df.allAttr.train));
Error in model.frame.default(formula = formula, data = as.matrix(full.df.allAttr.train), :
'data' must be a data.frame, not a matrix or an array
When i check for the class type its a data frame
class(full.df.allAttr.train)
[1] "data.frame"
Thank you for the input. That error went away when I created the formula with the proper column name, the one that holds the outcomes.
# Build the model formula as a string, "SpeakerName ~ <name1> + <name2> + ...",
# from the names held in rowNames (assigned further down as colnames(full.df)),
# then parse it into a formula object.
measurevar <- "SpeakerName"
formula_str <- paste(measurevar, paste(rowNames, collapse=" + "), sep=" ~ ")
formula <- as.formula(formula_str)
It now gives a different error, since my data frame has text row names. The error is below:
Error in model.frame.default(formula = formula, data = full.df.train, :
variable lengths differ (found for 'character(0)')
Sorry, I am new to this. I will add the full source code and data sets:
library(tm)
library(rpart)
# One corpus per speaker, read from that speaker's directory of speeches.
obamaCorpus <- Corpus(DirSource(directory = "D:/R/Chap 6/Speeches/obama" , encoding="UTF-8"))
romneyCorpus <- Corpus(DirSource(directory = "D:/R/Chap 6/Speeches/romney" , encoding="UTF-8"))
fullCorpus <- c(obamaCorpus,romneyCorpus)#1-22 (obama), 23-44(romney)
# Cleansing: strip punctuation and extra whitespace, lower-case, remove stop words.
fullCorpus.cleansed <- tm_map(fullCorpus, removePunctuation)
fullCorpus.cleansed <- tm_map(fullCorpus.cleansed, stripWhitespace)
# NOTE(review): newer tm versions expect content_transformer(tolower) here;
# the PlainTextDocument step below appears to compensate -- confirm.
fullCorpus.cleansed <- tm_map(fullCorpus.cleansed, tolower)
fullCorpus.cleansed <- tm_map(fullCorpus.cleansed, removeWords, stopwords("english"))
fullCorpus.cleansed <- tm_map(fullCorpus.cleansed, PlainTextDocument)
#fullCorpus.cleansed <- tm_map(fullCorpus.cleansed, stemDocument)
# Document-term matrix with sparse terms removed (sparsity threshold 0.6),
# converted to a data frame with one row per document.
full.dtm <- DocumentTermMatrix(fullCorpus.cleansed)
full.dtm.spars <- removeSparseTerms(full.dtm , 0.6)
full.matix <- data.matrix(full.dtm.spars)
full.df <- as.data.frame(full.matix)
# Label every row with the speaker whose corpus it came from. The counts are
# taken from the corpora themselves so the labels always line up with the
# documents; the original hard-coded rows 21:44 as "romney", which mislabels
# documents 21-22 under the 22/22 split noted above.
full.df[, "SpeakerName"] <- rep(
  c("obama", "romney"),
  times = c(length(obamaCorpus), length(romneyCorpus))
)
# Random 60/40 split of the row indices into training and test sets.
train.idx <- sample(nrow(full.df), ceiling(nrow(full.df) * 0.6))
test.idx <- setdiff(seq_len(nrow(full.df)), train.idx)
# NOTE(review): colnames(full.df) includes the SpeakerName column itself, so
# the response ends up on both sides of the formula built below.
rowNames <- colnames(full.df)
measurevar <- "SpeakerName"
# Builds the string "SpeakerName ~ <col1> + <col2> + ..." and parses it.
formula_str <- paste(measurevar, paste(rowNames, collapse=" + "), sep=" ~ ")
formula <- as.formula(formula_str)
# NOTE(review): full.df.train is not assigned anywhere in the code shown.
dt <- rpart(formula, method="class", data=full.df.train);
Fails at the last step
Data Sets are here
https://drive.google.com/folderview?id=0B1SogodTE-kJSHF6aFRmQURsV0U&usp=sharing
You never create full.df.train, and your formula is not correct.
This will work:
# Training rows selected by the sampled indices.
full.df.train <- full.df[train.idx, ]
# Classification tree predicting the speaker from every other column.
dt <- rpart(
  formula = SpeakerName ~ .,
  data = full.df.train,
  method = "class"
)
The problem with your formula is that you include SpeakerName on both sides of ~. If you want to use all the other variables as predictors, using the `.` shorthand on the right-hand side is much easier and more compact.

Resources