I am very new to R and I am trying to do an NGram WordCloud. However, my results always show a 1Gram instead of an NGram. I have searched for days for answers on the web and tried different methods...still the same result. Also, for some reason, I don't have the Ngramtokenizer function that I see everyone is using. However, I found another tokenizer function that I am using here. I hope someone can help me out. Thanks in advance!
library(dplyr)
library(ggplot2)
library(tidytext)
library(wordcloud)
library(tm)
library(RTextTools)
library(readxl)
library(qdap)
library(RWeka)
library(tau)
library(quanteda)
rm(list = ls())
#setwd("C:\\RStatistics\\Data\\")
#allverbatims <-read_excel("RS_Verbatims2018.xlsx") #reads excel files
#selgroup <- subset(allverbatims, FastNPS=="Detractors")
#selcolumns <- selgroup[ ,3:8]
#sample data
selcolumns <- c("this is a test","my test is not working","sample data here")
Comments <- Corpus(VectorSource(selcolumns))
CommentClean <- tm_map(Comments, removePunctuation)
CommentClean <- tm_map(CommentClean, content_transformer(tolower))
CommentClean <- tm_map(CommentClean,removeNumbers)
CommentClean <- tm_map(CommentClean, stripWhitespace)
CommentClean <- tm_map(CommentClean,removeWords,c(stopwords('english')))
#create manual tokenizer using tau textcnt since NGramTokenizer is not available
tokenize_ngrams <- function(x, n=2) return(rownames(as.data.frame(unclass(textcnt(x,method="string", n=n)))))
#test tokenizer
head(tokenize_ngrams(CommentClean))
td_mat <- TermDocumentMatrix(CommentClean, control = list(tokenize = tokenize_ngrams))
inspect(td_mat) #should be bigrams but the result is 1 gram
matrix <- as.matrix(td_mat)
sorted <- sort(rowSums(matrix),decreasing = TRUE)
data_text <- data.frame(word = names(sorted),freq = sorted)
set.seed(1234)
wordcloud(word = data_text$word, freq = data_text$freq, min = 5, max.words = 100, random.order = FALSE, rot.per = 0.1, colors = rainbow(30))
Related
I have dataframe with a column which includes strings of text, which I would like to do some analysis on. I would like to know what are the most used words and visualize this in a wordcloud. For single words (unigrams) I've managed to do so, but I'm failing in making my code work for n-grams (e.g. bigrams, trigrams). Here I've included my code for the unigrams. I'm open to adjusting my code to make it work, or to have a complete new piece of code. How would I best approach this?
library(wordcloud)
library(RColorBrewer)
library(wordcloud2)
library(tm)
library(stringr)
#Delete special characters and lower text
df$text <- str_replace_all(df$text, "[^[:alnum:]]", " ")
df$text <- tolower(df$text)
#From df to Corpus
corpus <- Corpus(VectorSource(df))
#Remove english stopwords,
stopwords<-c(stopwords("english"))
corpus <- tm_map(corpus, removeWords,stopwords)
rm(stopwords)
#Make term document matrix
tdm <- TermDocumentMatrix(corpus,control=list(wordLenths=c(1,Inf)))
#Make list of most frequent words
tdm_freq <- as.matrix(tdm)
words <- sort(rowSums(tdm_freq),decreasing=TRUE)
tdm_freq <- data.frame(word = names(words),freq=words)
rm(words)
#Make a wordcloud
wordcloud2(tdm_freq, size = 0.4, minSize = 10, gridSize = 0,
fontFamily = 'Segoe UI', fontWeight = 'normal',
color = 'red', backgroundColor = "white",
minRotation = -pi/4, maxRotation = pi/4, shuffle = TRUE,
rotateRatio = 0.4, shape = 'circle', ellipticity = 0.8,
widgetsize = NULL, figPath = NULL, hoverFunction = NULL)
Change Corpus to VCorpus so tokenising will work.
# Data
df <- data.frame(text = c("I have dataframe with a column I have dataframe with a column",
"I would like to know what are the most I would like to know what are the most",
"For single words (unigrams) I've managed to do so For single words (unigrams) I've managed to do so",
"Here I've included my code for the unigrams Here I've included my code for the unigrams"))
# VCorpus
corpus <- VCorpus(VectorSource(df))
funs <- list(stripWhitespace,
removePunctuation,
function(x) removeWords(x, stopwords("english")),
content_transformer(tolower))
corpus <- tm_map(corpus, FUN = tm_reduce, tmFuns = funs)
# Tokenise data without requiring any particular package
ngram_token <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse=" "), use.names=FALSE)
# Pass into TDM control argument
tdm <- TermDocumentMatrix(corpus, control = list(tokenize = ngram_token))
freq <- rowSums(as.matrix(tdm))
tdm_freq <- data.frame(term = names(freq), occurrences = freq)
tdm_freq
term occurrences
code unigrams code unigrams 2
column dataframe column dataframe 1
column like column like 1
dataframe column dataframe column 2
included code included code 2
...
working on a document classification model using the code provided by Tim DAuria (https://www.youtube.com/watch?v=j1V2McKbkLo), but can not figure out how to actually see the detail analysis of the classification on the 'Test' data.
I am using the model to help classify contracts by type and want to see the specific classification assigned to the different contracts. For example, the model assigns 15 contracts as "x" type of contract. How can I view those 15 file names? The code below works great for the classification piece. Just posting it for reference.
Please help! Really new at this and I'm sure I'm missing something obvious but could not find anything on the web.
Classification Code below:
#int
libs <- c("tm", "plyr","class")
lapply(libs, require, character.only = TRUE)
#set options
options(stringsAsFactors = FALSE)
#set parameters
contract <- c("build construction", "other")
pathname <- ..Desktop/ML/ContractReview"
#clean text
cleanCorpus <- function(corpus) {
corpus.tmp <- tm_map(corpus, removePunctuation)
corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)
corpus.tmp <- tm_map(corpus.tmp, content_transformer(tolower))
corpus.tmp <- tm_map(corpus.tmp, removeWords, stopwords("english"))
corpus.tmp <- tm_map(corpus.tmp, stemDocument)
return(corpus.tmp)
}
#build TDM
generateTDM <- function(contract, path) {
c.dir <- sprintf ("%s/%s", path, contract)
c.cor <- VCorpus(DirSource(directory = c.dir), readerControl = list(reader=readPlain))
c.cor.cl <- cleanCorpus(c.cor)
c.tdm <- TermDocumentMatrix(c.cor.cl)
c.tdm <- removeSparseTerms(c.tdm, .07)
result <- list(name = contract, tdm = c.tdm)
}
tdm <- lapply(contract, generateTDM, path = pathname)
# attach name
bindcontractToTDM <- function(tdm) {
c.mat <-t(data.matrix(tdm[["tdm"]]))
c.df <- as.data.frame(c.mat, stringsAsFactors = FALSE)
c.df <- cbind(c.df, rep(tdm[["name"]], nrow(c.df)))
colnames(c.df) [ncol(c.df)] <- "targetcontract"
return(c.df)
}
contractTDM <- lapply(tdm, bindcontractToTDM)
#stack if you have more than one dataframe
tdm.stack <- do.call(rbind.fill, contractTDM)
tdm.stack[is.na(tdm.stack)] <-0
#hold-out
train.idx <- sample(nrow(tdm.stack), ceiling(nrow(tdm.stack)* 0.7))
test.idx <- (1:nrow(tdm.stack))[- train.idx]
#model - knn
tdm.contract <-tdm.stack[, "targetcontract"]
tdm.stack.nl <- tdm.stack[, !colnames(tdm.stack) %in% "targetcontract"]
knn.pred <- knn(tdm.stack.nl[train.idx, ], tdm.stack.nl[test.idx, ], tdm.contract[train.idx])
#accuracy
conf.mat <- table("predictions"= knn.pred, Actual = tdm.contract[test.idx])
(accuracy <- sum(diag(conf.mat)) / length(test.idx)*100)
The answer to your question is hidden in the knn.pred object, which stores predicted labels for the test cases.
Since there is no input data provided (I don't think it was provided by the author of the video?), I am not sure about the details of the contract classes from your example. However, the output of the knn function from the class package you are using is a factor with labels that the algorithm predicted for the test documents
(you will notice that
length(knn.pred)
is equal to 0.3 * nrow(tdm.stack) ).
If you would like to view/store the predicted label and the actual label for each entry, then you can create a suitable data frame:
label_df <- cbind(label_pred = knn.pred, label_actual = tdm.contract[test.idx])
Alternatively, you can also choose to include the remaining columns of the tdm.contract (if you would like to re-examine the tdm.stack information in that context)
label_df <- cbind(label_pred = knn.pred, label_actual = tdm.contract[test.idx], tdm.stack.nl[test.idx,])
You can then filter either of these data frames to see how your entries of interest have been labelled.
Alternatively, you can choose to run k nearest neighbors algorithm using a function from a different package, in which case the output might differ.
I want to perform Sentiment classification on German dataset, I am using the following code, which works fine with english text, but raising error in case of German text.
Here is my code for the following:
#loading required libraries
library(tm)
library(readxl)
library(data.table)
library(plyr)
library(dplyr)
library(zoo)
library(ggplot2)
library(ranger)
library(e1071)
df<- data.table(read_excel("data/German2datasets.xlsx", skip = 1))
# An abstract function to preprocess a text column
preprocess <- function(text_column)
{
# Use tm to get a doc matrix
corpus <- Corpus(VectorSource(text_column))
# all lower case
corpus <- tm_map(corpus, content_transformer(tolower))
# remove punctuation
corpus <- tm_map(corpus, content_transformer(removePunctuation))
# remove numbers
corpus <- tm_map(corpus, content_transformer(removeNumbers))
# remove stopwords
corpus <- tm_map(corpus, removeWords, stopwords("german"))
# stem document
corpus <- tm_map(corpus, stemDocument)
# strip white spaces (always at the end)
corpus <- tm_map(corpus, stripWhitespace)
# return
corpus
}
# Get preprocess training and test data
corpus <- preprocess(df$TEXT)
# Create a Document Term Matrix for train and test
# Just including bi and tri-grams
Sys.setenv(JAVA_HOME='D://Program Files/Java/jre1.8.0_112') # for 32-bit version
library(rJava)
library(RWeka)
# Bi-Trigram tokenizer function (you can always get longer n-grams)
bitrigramtokeniser <- function(x, n) {
RWeka:::NGramTokenizer(x, RWeka:::Weka_control(min = 2, max = 3))
}
"
Remove remove words <=2
TdIdf weighting
Infrequent (< than 1% of documents) and very frequent (> 80% of documents) terms not included
"
dtm <- DocumentTermMatrix(corpus, control=list(wordLengths=c(2, Inf),
tokenize = bitrigramtokeniser,
weighting = function(x) weightTfIdf(x, normalize = FALSE),
bounds=list(global=c(floor(length(corpus)*0.01), floor(length(corpus)*.8)))))
sent <- df$Sentiment
# Variable selection
# ~~~~~~~~~~~~~~~~~~~~
"
For dimension reduction.
The function calculates chi-square value for each phrase and keeps phrases with highest chi_square values
Ideally you want to put variable selection as part of cross-validation.
chisqTwo function takes:
document term matrix (dtm),
vector of labels (labels), and
number of n-grams you want to keep (n_out)
"
chisqTwo <- function(dtm, labels, n_out=2000){
mat <- as.matrix(dtm)
cat1 <- colSums(mat[labels==T,]) # total number of times phrase used in cat1
cat2 <- colSums(mat[labels==F,]) # total number of times phrase used in cat2
n_cat1 <- sum(mat[labels==T,]) - cat1 # total number of phrases in soft minus cat1
n_cat2 <- sum(mat[labels==F,]) - cat2 # total number of phrases in hard minus cat2
num <- (cat1*n_cat2 - cat2*n_cat1)^2
den <- (cat1 + cat2)*(cat1 + n_cat1)*(cat2 + n_cat2)*(n_cat1 + n_cat2)
chisq <- num/den
chi_order <- chisq[order(chisq)][1:n_out]
mat <- mat[, colnames(mat) %in% names(chi_order)]
}
n <- nrow(dtm)
shuffled <- dtm[sample(n),]
train_dtm <- shuffled[1:round(0.7 * n),]
test_dtm <- shuffled[(round(0.7 * n) + 1):n,]
"
With high dimensional data, test matrix may not have all the phrases training matrix has.
This function fixes that - so that test matrix has the same columns as training.
testmat takes column names of training matrix (train_mat_cols), and
test matrix (test_mat)
and outputs test_matrix with the same columns as training matrix
"
# Test matrix maker
testmat <- function(train_mat_cols, test_mat){
# train_mat_cols <- colnames(train_mat); test_mat <- as.matrix(test_dtm)
test_mat <- test_mat[, colnames(test_mat) %in% train_mat_cols]
miss_names <- train_mat_cols[!(train_mat_cols %in% colnames(test_mat))]
if(length(miss_names)!=0){
colClasses <- rep("numeric", length(miss_names))
df <- read.table(text = '', colClasses = colClasses, col.names = miss_names)
df[1:nrow(test_mat),] <- 0
test_mat <- cbind(test_mat, df)
}
as.matrix(test_mat)
}
# Train and test matrices
train_mat <- chisqTwo(train_dtm, train$Sentiment)
test_mat <- testmat(colnames(train_mat), as.matrix(test_dtm))
dim(train_mat)
dim(test_mat)
n <- nrow(df)
shuffled <- df[sample(n),]
train_data <- shuffled[1:round(0.7 * n),]
test_data <- shuffled[(round(0.7 * n) + 1):n,]
train_mat <- as.data.frame(as.matrix(train_mat))
colnames(train_mat) <- make.names(colnames(train_mat))
train_mat$Sentiment <- train_data$Sentiment
test_mat <- as.data.frame(as.matrix(test_mat))
colnames(test_mat) <- make.names(colnames(test_mat))
test_mat$Sentiment <- test_data$Sentiment
train_mat$Sentiment <- as.factor(train_mat$Sentiment)
test_mat$Sentiment <- as.factor(test_mat$Sentiment)
Then, I will apply caret ML algos on the same for prediction of the Sentiment on the train and test data created.
I am getting the following error at "preprocess" function.
> corpus <- preprocess(df$TEXT)
Show Traceback
Rerun with Debug
Error in FUN(content(x), ...) :
invalid input 'Ich bin seit Jahren zufrieden mit der Basler Versicherubg🌺' in 'utf8towcs'
Data - https://drive.google.com/open?id=1T_LpL2G8upztihAC2SQeVs4YCPH-yfOs
How about trying a different package to get to the pre-Weka etc stages? This is equivalent (and simpler imho):
library("quanteda")
library("readtext")
# reads in the spreadsheet and creates the corpus
germancorp <-
readtext("data/German2datasets.xlsx", text_field = "TEXT")) %>%
corpus()
# does all of the steps of your preprocess() function
dtm <- dfm(germancorp, ngrams = c(2, 3),
tolower = TRUE,
remove_punct = TRUE,
remove_numbers = TRUE,
remove = stopwords("german"),
stem = TRUE)
# remove words with only a single count
dtm <- dfm_trim(dtm, min_count = 2)
# form tf-idf weights - change the base argument from default 10 if you wish
dtm <- dfm_tfidf(dtm)
# if you really want a tm formatted DocumentTermMatrix
convert(dtm, to = "tm")
The quanteda package can do some of what you list as additional steps, although it is not clear exactly what you are doing. (Your question focused on the preprocess() failure so I answered that.)
if you haven´t found the reason yet:
invalid input in 'utf8towcs'
It is the encoding of the file (depending on your [virtual] environment and the current sys-options and of course on the the of saving the file to disk at the time of creation)
A workaround is like:
usableText=str_replace_all(tweets$text,"[^[:graph:]]", " ")
or
your_corpus<- tm_map(your_corpus,toSpace,"[^[:graph:]]")
using R(3.2.5) and with the following packages loaded
'SnowballC', 'tm', 'NLP', 'RWeka', 'RTextTools', 'wordcloud', 'fpc'
carmenCorpus <- Corpus(VectorSource(feedback$Description))
carmenCorpus <- tm_map(carmenCorpus, PlainTextDocument)
carmenCorpus <- tm_map(carmenCorpus, removePunctuation)
carmenCorpus <- tm_map(carmenCorpus, removeWords, stopwords('english'))
carmenCorpus <- tm_map(carmenCorpus, stemDocument)
When I go to create a wordcloud i get the following error. this is a new error, when the code was run several months ago there was no issue:
wordcloud(carmenCorpus, max.words = 100, random.order = FALSE)
# Error in simple_triplet_matrix(i, j, v, nrow = length(terms), ncol = length(corpus), :
# 'i, j' invalid
Please advise on this issue.
wordcloud cannot just take a corpus and magically churn out a wordcloud.
You have to do the hard work of converting it into a TextDocumentMatrix and then summing up the word frequencies:
# convert to TDM
tdm <- TermDocumentMatrix(carmenCorpus, control=list(stemming=True))
# calculate word frequencies
freqs = sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
# plot wordcloud
wordcloud(names(freqs), freqs,
max.words = 100,
random.order = FALSE,
# any other params you want to pass into wordcloud
)
I'm using a support vector machine for my document classification task! it classifies all my Articles in the training-set, but fails to classify the ones in my test-set!
trainDTM is the document term matrix of my training-set. testDTM is the one for the test-set.
here's my (not so beautiful) code:
# create data.frame with labelled sentences
labeled <- as.data.frame(read.xlsx("C:\\Users\\LABELED.xlsx", 1, header=T))
# create training set and test set
traindata <- as.data.frame(labeled[1:700,c("ARTICLE","CLASS")])
testdata <- as.data.frame(labeled[701:1000, c("ARTICLE","CLASS")])
# Vector, Source Transformation
trainvector <- as.vector(traindata$"ARTICLE")
testvector <- as.vector(testdata$"ARTICLE")
trainsource <- VectorSource(trainvector)
testsource <- VectorSource(testvector)
# CREATE CORPUS FOR DATA
traincorpus <- Corpus(trainsource)
testcorpus <- Corpus(testsource)
# my own stopwords
sw <- c("i", "me", "my")
## CLEAN TEXT
# FUNCTION FOR CLEANING
cleanCorpus <- function(corpus){
corpus.tmp <- tm_map(corpus, removePunctuation)
corpus.tmp <- tm_map(corpus.tmp,stripWhitespace)
corpus.tmp <- tm_map(corpus.tmp,tolower)
corpus.tmp <- tm_map(corpus.tmp, removeWords, sw)
corpus.tmp <- tm_map(corpus.tmp, removeNumbers)
corpus.tmp <- tm_map(corpus.tmp, stemDocument, language="en")
return(corpus.tmp)}
# CLEAN CORP WITH ABOVE FUNCTION
traincorpus.cln <- cleanCorpus(traincorpus)
testcorpus.cln <- cleanCorpus(testcorpus)
## CREATE N-GRAM DOCUMENT TERM MATRIX
# CREATE N-GRAM TOKENIZER
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
# CREATE DTM
trainmatrix.cln.bi <- DocumentTermMatrix(traincorpus.cln, control = list(tokenize = BigramTokenizer))
testmatrix.cln.bi <- DocumentTermMatrix(testcorpus.cln, control = list(tokenize = BigramTokenizer))
# REMOVE SPARSE TERMS
trainDTM <- removeSparseTerms(trainmatrix.cln.bi, 0.98)
testDTM <- removeSparseTerms(testmatrix.cln.bi, 0.98)
# train the model
SVM <- svm(as.matrix(trainDTM), as.factor(traindata$CLASS))
# get classifications for training-set
results.train <- predict(SVM, as.matrix(trainDTM)) # works fine!
# get classifications for test-set
results <- predict(SVM,as.matrix(testDTM))
Error in scale.default(newdata[, object$scaled, drop = FALSE], center = object$x.scale$"scaled:center", :
length of 'center' must equal the number of columns of 'x'
i don't understand this error. and what is 'center' ?
thank you!!
Train and test data must be in the same features space ; building two separates DTM in that way can't work.
A solution with using RTextTools :
DocTermMatrix <- create_matrix(labeled, language="english", removeNumbers=TRUE, stemWords=TRUE, ...)
container <- create_container(DocTermMatrix, labels, trainSize=1:700, testSize=701:1000, virgin=FALSE)
models <- train_models(container, "SVM")
results <- classify_models(container, models)
Or, to answer your question (with e1071), you can specify the vocabulary ('features') in the projection (DocumentTermMatrix) :
DocTermMatrixTrain <- DocumentTermMatrix(Corpus(VectorSource(trainDoc)));
Features <- DocTermMatrixTrain$dimnames$Terms;
DocTermMatrixTest <- DocumentTermMatrix(Corpus(VectorSource(testDoc)),control=list(dictionary=Features));