Empty term document matrix - r

I seem to run into a problem whenever I try to inspect my frequent words and associations.
When I make the TDM I get this info:
[screenshot: TermDocumentMatrix summary]
I can see I have plenty of terms to use, in plenty of documents.
However!
When I try to inspect the content of "tdm", I get this info:
[screenshot: inspecting the TDM]
How come the TDM is suddenly empty?
Hope someone can help.
tweets <- userTimeline("RDataMining", n = 1000)
(n.tweet <- length(tweets))
tweets[1:3]
#convert tweets to a data frame
tweets.df <- twListToDF(tweets)
dim(tweets.df)
##Text cleaning
library(tm)
#build a corpus and specify the source to be a character vector
myCorpus <- Corpus(VectorSource(tweets.df$text))
#convert to lower case
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
#remove URLs
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
#remove anything other than English letters or space
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))
#remove stopwords + 2
myStopwords <- c(stopwords('english'),"available","via")
#remove "r" and "big" from stopwords
myStopwords <- setdiff(myStopwords, c("r","big"))
#remove stopwords from corpus
myCorpus <- tm_map(myCorpus,removeWords,myStopwords)
#remove extra whitespace
myCorpus <- tm_map(myCorpus, stripWhitespace)
#keep a copy of corpus to use later as a dictionary for stem completion
myCorpusCopy <- myCorpus
#stem words
library(SnowballC)
myCorpus <- tm_map(myCorpus,stemDocument)
stemCompletion2 <- function(x, dictionary) {
  x <- unlist(strsplit(as.character(x), ""))
  #because stemCompletion completes an empty string to a word in dict. Remove empty string to avoid this
  x <- x[x != ""]
  x <- stemCompletion(x, dictionary = dictionary)
  x <- paste(x, sep = "", collapse = "")
  PlainTextDocument(stripWhitespace(x))
}
myCorpus <- lapply(myCorpus, stemCompletion2, dictionary = myCorpusCopy)
myCorpus <- Corpus(VectorSource(myCorpus))
#count freq of "mining"
miningCases <- lapply(myCorpusCopy,
function(x) {grep(as.character(x),pattern = "\\<mining")})
sum(unlist(miningCases))
#count freq of "miner"
miningCases <- lapply(myCorpusCopy,
function(x) {grep(as.character(x),pattern = "\\<miner")})
sum(unlist(miningCases))
#count freq of "r"
miningCases <- lapply(myCorpusCopy,
function(x) {grep(as.character(x),pattern = "\\<r")})
sum(unlist(miningCases))
#replace "miner" with "mining"
myCorpus <- tm_map(myCorpus,content_transformer(gsub),
pattern = "miner", replacement = "mining")
tdm <- TermDocumentMatrix(myCorpus, control = list(removePunctuation = TRUE,stopwords = TRUE))
tdm
##Freq words and associations
idx <- which(dimnames(tdm)$Terms == "r")
inspect(tdm[idx + (0:5), 101:110])
#inspect frequent words
(freq.terms <- findFreqTerms(tdm, lowfreq = 15))
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq,term.freq >= 15)
df <- data.frame(term = names(term.freq), freq = term.freq)

I've been using the following Twitter query to test your code:
tweets = searchTwitter("r data mining", n=10)
and I think the problem is with your function stemCompletion2, which should look something like this:
stemCompletion2 <- function(x, dictionary) {
  x <- unlist(strsplit(as.character(x), " "))
  print("before:")
  print(x)
  #because stemCompletion completes an empty string to a word in dict. Remove empty string to avoid this
  x <- x[x != ""]
  x <- stemCompletion(x, dictionary = dictionary)
  print("after:")
  print(x)
  x <- paste(x, collapse = " ")
  PlainTextDocument(stripWhitespace(x))
}
The modifications are as follows: before, you had
x <- unlist(strsplit(as.character(x), ""))
which was creating a list of all the characters in each of the documents, and I've modified it to
x <- unlist(strsplit(as.character(x), " "))
to create a list of words. Similarly, when recomposing your documents, you were doing
x <- paste(x, sep = "", collapse = "")
which was creating the long strings you mention in your post, and I've modified it to:
x <- paste(x, collapse = " ")
to recompose the words.
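To see the two changes in isolation, here is a minimal base-R illustration (independent of the Twitter data):
x <- "r data mining"
unlist(strsplit(x, ""))   # "" splits into single characters: "r" " " "d" "a" ...
unlist(strsplit(x, " "))  # " " splits into words: "r" "data" "mining"
paste(c("r", "data", "mining"), collapse = "")   # old behaviour: "rdatamining"
paste(c("r", "data", "mining"), collapse = " ")  # new behaviour: "r data mining"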
One example of the completions would be for my data:
[1] "before:"
[1] "rt" "ebookdealalert" "r" "datamin" "project" "learn" "data" "mine"
[9] "realworld" "project" "book" "solv" "predict" "model"
[1] "after:"
rt ebookdealalert r datamin project learn data mine
"rt" "ebookdealalerts" "r" "datamining" "projects" "learn" "data" ""
realworld project book solv predict model
"realworld" "projects" "book" "solve" "predictive" "modeling"
After that step, you may be able to work with TermDocumentMatrix as expected.
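As a quick sanity check after the fix, rebuilding the TDM with the same calls already in your script should now report non-zero entries (a sketch):
tdm <- TermDocumentMatrix(myCorpus, control = list(removePunctuation = TRUE, stopwords = TRUE))
tdm                               # the summary should no longer show an empty matrix
findFreqTerms(tdm, lowfreq = 15)  # as in your original script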
Hope it helps.

Related

R: tm package on German text

I want to perform sentiment classification on a German dataset. I am using the following code, which works fine with English text but raises an error with German text.
Here is my code:
#loading required libraries
library(tm)
library(readxl)
library(data.table)
library(plyr)
library(dplyr)
library(zoo)
library(ggplot2)
library(ranger)
library(e1071)
df<- data.table(read_excel("data/German2datasets.xlsx", skip = 1))
# An abstract function to preprocess a text column
preprocess <- function(text_column)
{
  # Use tm to get a doc matrix
  corpus <- Corpus(VectorSource(text_column))
  # all lower case
  corpus <- tm_map(corpus, content_transformer(tolower))
  # remove punctuation
  corpus <- tm_map(corpus, content_transformer(removePunctuation))
  # remove numbers
  corpus <- tm_map(corpus, content_transformer(removeNumbers))
  # remove stopwords
  corpus <- tm_map(corpus, removeWords, stopwords("german"))
  # stem document
  corpus <- tm_map(corpus, stemDocument)
  # strip white spaces (always at the end)
  corpus <- tm_map(corpus, stripWhitespace)
  # return
  corpus
}
# Get preprocess training and test data
corpus <- preprocess(df$TEXT)
# Create a Document Term Matrix for train and test
# Just including bi and tri-grams
Sys.setenv(JAVA_HOME='D://Program Files/Java/jre1.8.0_112') # for 32-bit version
library(rJava)
library(RWeka)
# Bi-Trigram tokenizer function (you can always get longer n-grams)
bitrigramtokeniser <- function(x, n) {
RWeka:::NGramTokenizer(x, RWeka:::Weka_control(min = 2, max = 3))
}
"
Remove words <= 2 characters
Tf-idf weighting
Infrequent (< 1% of documents) and very frequent (> 80% of documents) terms not included
"
dtm <- DocumentTermMatrix(corpus, control = list(
  wordLengths = c(2, Inf),
  tokenize = bitrigramtokeniser,
  weighting = function(x) weightTfIdf(x, normalize = FALSE),
  bounds = list(global = c(floor(length(corpus)*0.01), floor(length(corpus)*.8)))))
sent <- df$Sentiment
# Variable selection
# ~~~~~~~~~~~~~~~~~~~~
"
For dimension reduction.
The function calculates chi-square value for each phrase and keeps phrases with highest chi_square values
Ideally you want to put variable selection as part of cross-validation.
chisqTwo function takes:
document term matrix (dtm),
vector of labels (labels), and
number of n-grams you want to keep (n_out)
"
chisqTwo <- function(dtm, labels, n_out = 2000){
  mat <- as.matrix(dtm)
  cat1 <- colSums(mat[labels == T, ])       # total number of times phrase used in cat1
  cat2 <- colSums(mat[labels == F, ])       # total number of times phrase used in cat2
  n_cat1 <- sum(mat[labels == T, ]) - cat1  # total number of phrases in soft minus cat1
  n_cat2 <- sum(mat[labels == F, ]) - cat2  # total number of phrases in hard minus cat2
  num <- (cat1*n_cat2 - cat2*n_cat1)^2
  den <- (cat1 + cat2)*(cat1 + n_cat1)*(cat2 + n_cat2)*(n_cat1 + n_cat2)
  chisq <- num/den
  # keep the n_out phrases with the highest chi-square values
  chi_order <- chisq[order(chisq, decreasing = TRUE)][1:n_out]
  mat <- mat[, colnames(mat) %in% names(chi_order)]
  mat
}
n <- nrow(dtm)
shuffled <- dtm[sample(n),]
train_dtm <- shuffled[1:round(0.7 * n),]
test_dtm <- shuffled[(round(0.7 * n) + 1):n,]
"
With high dimensional data, test matrix may not have all the phrases training matrix has.
This function fixes that - so that test matrix has the same columns as training.
testmat takes column names of training matrix (train_mat_cols), and
test matrix (test_mat)
and outputs test_matrix with the same columns as training matrix
"
# Test matrix maker
testmat <- function(train_mat_cols, test_mat){
  # train_mat_cols <- colnames(train_mat); test_mat <- as.matrix(test_dtm)
  test_mat <- test_mat[, colnames(test_mat) %in% train_mat_cols]
  miss_names <- train_mat_cols[!(train_mat_cols %in% colnames(test_mat))]
  if(length(miss_names) != 0){
    colClasses <- rep("numeric", length(miss_names))
    df <- read.table(text = '', colClasses = colClasses, col.names = miss_names)
    df[1:nrow(test_mat), ] <- 0
    test_mat <- cbind(test_mat, df)
  }
  as.matrix(test_mat)
}
# Train and test matrices
train_mat <- chisqTwo(train_dtm, train$Sentiment)
test_mat <- testmat(colnames(train_mat), as.matrix(test_dtm))
dim(train_mat)
dim(test_mat)
n <- nrow(df)
shuffled <- df[sample(n),]
train_data <- shuffled[1:round(0.7 * n),]
test_data <- shuffled[(round(0.7 * n) + 1):n,]
train_mat <- as.data.frame(as.matrix(train_mat))
colnames(train_mat) <- make.names(colnames(train_mat))
train_mat$Sentiment <- train_data$Sentiment
test_mat <- as.data.frame(as.matrix(test_mat))
colnames(test_mat) <- make.names(colnames(test_mat))
test_mat$Sentiment <- test_data$Sentiment
train_mat$Sentiment <- as.factor(train_mat$Sentiment)
test_mat$Sentiment <- as.factor(test_mat$Sentiment)
Then I will apply caret ML algorithms to these matrices to predict the Sentiment of the train and test data.
I am getting the following error at the preprocess() function:
> corpus <- preprocess(df$TEXT)
Error in FUN(content(x), ...) :
invalid input 'Ich bin seit Jahren zufrieden mit der Basler Versicherubg🌺' in 'utf8towcs'
Data - https://drive.google.com/open?id=1T_LpL2G8upztihAC2SQeVs4YCPH-yfOs
How about trying a different package to get to the pre-Weka etc. stages? This is equivalent (and simpler, IMHO):
library("quanteda")
library("readtext")
# reads in the spreadsheet and creates the corpus
germancorp <-
  readtext("data/German2datasets.xlsx", text_field = "TEXT") %>%
  corpus()
# does all of the steps of your preprocess() function
dtm <- dfm(germancorp, ngrams = c(2, 3),
           tolower = TRUE,
           remove_punct = TRUE,
           remove_numbers = TRUE,
           remove = stopwords("german"),
           stem = TRUE)
# remove words with only a single count
dtm <- dfm_trim(dtm, min_count = 2)
# form tf-idf weights - change the base argument from default 10 if you wish
dtm <- dfm_tfidf(dtm)
# if you really want a tm formatted DocumentTermMatrix
convert(dtm, to = "tm")
The quanteda package can do some of what you list as additional steps, although it is not clear exactly what you are doing. (Your question focused on the preprocess() failure so I answered that.)
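If you also want the document-frequency bounds from your DocumentTermMatrix call (drop terms in fewer than 1% or more than 80% of documents), dfm_trim can express that too; a sketch, noting that the argument names below come from newer quanteda releases than the min_count call above:
# drop terms occurring in < 1% or > 80% of documents
dtm <- dfm_trim(dtm, min_docfreq = 0.01, max_docfreq = 0.80, docfreq_type = "prop")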
If you haven't found the reason yet:
invalid input in 'utf8towcs'
It is the encoding of the file (depending on your [virtual] environment and the current sys options, and of course on the way the file was saved to disk at the time of creation).
A workaround looks like this:
usableText <- str_replace_all(tweets$text, "[^[:graph:]]", " ")
or
your_corpus <- tm_map(your_corpus, toSpace, "[^[:graph:]]")
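Note that toSpace is not defined in the snippet above and str_replace_all comes from stringr; a common way to define toSpace with tm (my assumption, since the answer omits it) is:
library(stringr)  # provides str_replace_all() for the first workaround
# usual tm idiom for the toSpace helper used above
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
your_corpus <- tm_map(your_corpus, toSpace, "[^[:graph:]]")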

A lemmatizing function using a hash dictionary does not work with tm package in R

I would like to lemmatize Polish text using a large external dictionary (in the format of the txt variable below). I am not lucky enough to have a Polish option in the popular text mining packages. The answer https://stackoverflow.com/a/45790325/3480717 by @DmitriySelivanov works well with a simple vector of texts. (I have also removed Polish diacritics from both the dictionary and the corpus.)
Unfortunately it does not work with the corpus format generated by tm. Let me paste Dmitriy's code:
library(hashmap)
library(data.table)
txt =
"Abadan Abadanem
Abadan Abadanie
Abadan Abadanowi
Abadan Abadanu
abadańczyk abadańczycy
abadańczyk abadańczykach
abadańczyk abadańczykami
"
dt = fread(txt, header = F, col.names = c("lemma", "word"))
lemma_hm = hashmap(dt$word, dt$lemma)
lemma_hm[["Abadanu"]]
#"Abadan"
lemma_tokenizer = function(x, lemma_hashmap,
                           tokenizer = text2vec::word_tokenizer) {
  tokens_list = tokenizer(x)
  for(i in seq_along(tokens_list)) {
    tokens = tokens_list[[i]]
    replacements = lemma_hashmap[[tokens]]
    ind = !is.na(replacements)
    tokens_list[[i]][ind] = replacements[ind]
  }
  tokens_list
}
texts = c("Abadanowi abadańczykach OutOfVocabulary",
"abadańczyk Abadan OutOfVocabulary")
lemma_tokenizer(texts, lemma_hm)
#[[1]]
#[1] "Abadan" "abadańczyk" "OutOfVocabulary"
#[[2]]
#[1] "abadańczyk" "Abadan" "OutOfVocabulary"
Now I would like to apply it to a tm corpus, "docs". Here is an example of the syntax I would use with the tm package on a tm-generated corpus:
docs <- tm_map(docs, function(x) lemma_tokenizer(x, lemma_hashmap="lemma_hm"))
another syntax that I tried:
LemmaTokenizer <- function(x) lemma_tokenizer(x, lemma_hashmap="lemma_hm")
docsTDM <-
DocumentTermMatrix(docs, control = list(wordLengths = c(4, 25), tokenize=LemmaTokenizer))
It throws an error:
Error in lemma_hashmap[[tokens]] :
attempt to select more than one element in vectorIndex
The function works with a vector of texts, but it will not work with a tm corpus. Thanks in advance for suggestions (including the use of this function with another text mining package if it will not work with tm).
I see two problems here: 1) your custom function returns a list, while it should return a vector of strings; and 2) you are passing the wrong lemma_hashmap argument.
A quick workaround to fix the first problem is to use paste() and sapply() before returning the function result.
lemma_tokenizer = function(x, lemma_hashmap,
                           tokenizer = text2vec::word_tokenizer) {
  tokens_list = tokenizer(x)
  for(i in seq_along(tokens_list)) {
    tokens = tokens_list[[i]]
    replacements = lemma_hashmap[[tokens]]
    ind = !is.na(replacements)
    tokens_list[[i]][ind] = replacements[ind]
  }
  # paste together, return a vector
  sapply(tokens_list, function(i) paste(i, collapse = " "))
}
We can run the same example of your post.
texts = c("Abadanowi abadańczykach OutOfVocabulary",
"abadańczyk Abadan OutOfVocabulary")
lemma_tokenizer(texts, lemma_hm)
[1] "Abadan abadańczyk OutOfVocabulary" "abadańczyk Abadan OutOfVocabulary"
Now, we can use tm_map. Just make sure to use lemma_hm (i.e., the variable) and not "lemma_hm" (a string) as argument.
docs <- SimpleCorpus(VectorSource(texts))
out <- tm_map(docs, (function(x) {lemma_tokenizer(x, lemma_hashmap=lemma_hm)}))
out[[1]]$content
[1] "Abadan abadańczyk OutOfVocabulary"
For Polish lemmatization, please refer to this script
https://github.com/MarcinKosinski/trigeR5/blob/master/R/lematyzacja.R, which uses the polmorfologik dictionary https://github.com/MarcinKosinski/trigeR5/tree/master/dicts (stop words are included there as well).
Try using quanteda's dictionary() function, after creating a dictionary that maps each variant (as a dictionary value) to its lemma (as a dictionary key). Below, it looks up your values and then pastes the tokens back into a text. (If you wanted tokens, you would not need the last paste() operation.)
txt <-
"Abadan Abadanem
Abadan Abadanie
Abadan Abadanowi
Abadan Abadanu
abadańczyk abadańczycy
abadańczyk abadańczykach
abadańczyk abadańczykami"
list_temp <- strsplit(readLines(textConnection(txt)), "\\s+")
list_temp2 <- lapply(list_temp, "[", 2)
names(list_temp2) <- sapply(list_temp, "[", 1)
library("quanteda")
polish_lemma_dict <- dictionary(list_temp2)
# Dictionary object with 7 key entries.
# - Abadan:
# - abadanem
# - Abadan:
# - abadanie
# - Abadan:
# - abadanowi
# - Abadan:
# - abadanu
# - abadańczyk:
# - abadańczycy
# - abadańczyk:
# - abadańczykach
# - abadańczyk:
# - abadańczykami
texts <- c("Abadanowi abadańczykach OutOfVocabulary",
"abadańczyk Abadan OutOfVocabulary")
The texts can now be converted into tokens, and use quanteda's tokens_lookup() function to replace the dictionary values (inflected words) with the dictionary keys (lemmas). In the last step, I've pasted the tokens back together, which you can skip if you want tokens and not a full text.
require(magrittr)
texts %>%
tokens() %>%
tokens_lookup(dictionary = polish_lemma_dict, exclusive = FALSE, capkeys = FALSE) %>%
as.character() %>%
paste(collapse = " ")
# [1] "Abadan abadańczyk OutOfVocabulary abadańczyk Abadan OutOfVocabulary"
Here is the complete (imperfect) code in which I used the answer. Credit goes to many people; I list all the sources at the bottom. It is very rough, I realise, but it does the job for me, i.e. I can use a txt dictionary of lemmas and my own stop words to classify Polish texts. Thanks to Damiano Fantini, Dmitriy Selivanov and many others.
#----1. Set up. ----
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
library(readtext)
library(tm)
library(proxy)
library(stringi)
library(stringr)
library(hashmap)
library(data.table)
library(text2vec)
# For reading n-grams
library(RWeka) #(*)
BigramTokenizer <-
function(x) NGramTokenizer(x, Weka_control(min = 1, max = 3)) #(*)
#----2. Read data. ----
stopwordsPL <- as.vector(str_split(readLines("polish.stopwords.text",encoding = "UTF-8"), pattern = " ",simplify = T))
docs <- VCorpus(DirSource(pattern="txt"))
titles <- rownames(summary(docs))
docs <- tm_map(docs, removeWords, words=stopwordsPL)
docs <- tm_map(docs, tolower)
docs <- tm_map(docs, function(x) stri_trans_general(x, "Latin-ASCII"))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)
# for English texts it would be simpler
# docs <- tm_map(docs, removeWords, stopwords("english")) #can add other words to remove
# docs <- tm_map(docs, stemDocument, "english")
#====3. Lemmatize ====
# # Dictionary from http://www.lexiconista.com/datasets/lemmatization/
# lemmadict_file = "lemmatization-pl.text"
# dt = fread(file= lemmadict_file, header = F, col.names = c("lemma", "word"), data.table=T, encoding="UTF-8")
# # I threw away Polish letters, maybe changing locales may help.
# dt$lemma <- stri_trans_general(dt$lemma, "Latin-ASCII;lower")
# dt$word <- stri_trans_general(dt$word, "Latin-ASCII;lower")
# dt <- unique(dt)
#
# # Creating hash dictionary
# lemma_hm = hashmap(dt$word, dt$lemma)
#
# # Test if it works
# lemma_hm[["mnozyl"]]
# # [1] "mnozyc"
#
# save_hashmap(lemma_hm, file="lemma_hm", overwrite = TRUE, compress = TRUE)
lemma_hm <- load_hashmap(file="lemma_hm")
lemma_tokenizer = function(x, lemma_hashmap,
                           tokenizer = text2vec::word_tokenizer) {
  tokens_list = tokenizer(x)
  for(i in seq_along(tokens_list)) {
    tokens = tokens_list[[i]]
    replacements = lemma_hashmap[[tokens]]
    ind = !is.na(replacements)
    tokens_list[[i]][ind] = replacements[ind]
  }
  # paste together, return a vector
  sapply(tokens_list, function(i) paste(i, collapse = " "))
}
docs <- tm_map(docs, (function(x) {lemma_tokenizer(x, lemma_hashmap=lemma_hm)}))
docs <- tm_map(docs, PlainTextDocument)
#====4. Create document term matrix====
docsTDM <-
DocumentTermMatrix(docs, control = list(wordLengths = c(5, 25),tokenize = BigramTokenizer)) # tokenize=LemmaTokenizer, tokenize = BigramTokenizer (*)
docsTDM$dimnames
#====5. Remove sparse and common words====
docsTDM <- removeSparseTerms(docsTDM, .90)
# https://stackoverflow.com/questions/25905144/removing-overly-common-words-occur-in-more-than-80-of-the-documents-in-r
removeCommonTerms <- function(x, pct) {
  stopifnot(inherits(x, c("DocumentTermMatrix", "TermDocumentMatrix")),
            is.numeric(pct), pct > 0, pct < 1)
  m <- if (inherits(x, "DocumentTermMatrix")) t(x) else x
  t <- table(m$i) < m$ncol * (pct)
  termIndex <- as.numeric(names(t[t]))
  if (inherits(x, "DocumentTermMatrix"))
    x[, termIndex]
  else
    x[termIndex, ]
}
docsTDM <-
removeCommonTerms(docsTDM, .8) #remove terms that are in >=80% of the documents
docsTDM$dimnames
#====6. Cluster data (hclust). ====
docsdissim <- dist(as.matrix(docsTDM), method = "cosine")
docsdissim2 <- as.matrix(docsdissim)
dim(docsdissim2)
rownames(docsdissim2) <- titles
colnames(docsdissim2) <- titles
h <- hclust(docsdissim, method = "ward.D2")
plot(h, labels = titles, sub = "")
# Library hclust with p-values (pvclust)
library(pvclust)
h_pv <- pvclust(docsdissim2, method.hclust = "ward.D2", method.dist ="correlation")
plot(h_pv)
data.frame(cutree(tree = h_pv$hclust, k = 4))
# pvclust provides two types of p-values: AU (Approximately Unbiased) p-value and BP (Bootstrap Probability) value.
# AU p-value, which is computed by multiscale bootstrap resampling, is a better approximation to unbiased p-value
# than BP value computed by normal bootstrap resampling.
# AU p-value > 0.95 we can assume the clusters exist and may stably be
# observed if we increase the number of observations.
# (http://stat.sys.i.kyoto-u.ac.jp/prog/pvclust/)
#==== Literature:====
# Original article:
# http://www.rexamine.com/2014/06/text-mining-in-r-automatic-categorization-of-wikipedia-articles/
# Updates to make it work after some functions became obsolete:
# https://stackoverflow.com/questions/34423823/r-automatic-categorization-of-wikipedia-articles
# https://stackoverflow.com/questions/34372166/error-using-termdocumentmatrix-and-dist-functions-in-r
#
# Based on that:
# http://brazenly.blogspot.co.uk/2015/02/r-categorization-clustering-of.html
#
# Sparse terms:
# https://stackoverflow.com/questions/28763389/how-does-the-removesparseterms-in-r-work
# Lemmatizing function:
# https://stackoverflow.com/questions/46122591/a-lemmatizing-function-using-a-hash-dictionary-does-not-work-with-tm-package-in
# https://stackoverflow.com/questions/45762559/lemmatization-using-txt-file-with-lemmes-in-r/45790325#45790325

Find 2 words phrase using tm R

I know this has been asked multiple times. For example:
Finding 2 & 3 word Phrases Using R TM Package
However, I don't know why none of these solutions work with my data. The result is always unigrams, no matter what n I choose for the n-grams (2, 3 or 4).
Could anybody tell me why? I suspect the encoding is the reason.
Edit: a small part of the data.
comments <- c("Merge branch 'master' of git.internal.net:/git/live/LegacyCodebase into problem_70918\n",
"Merge branch 'master' of git.internal.net:/git/live/LegacyCodebase into tm-247\n",
"Merge branch 'php5.3-upgrade-sprint6-7' of git.internal.net:/git/pn-project/LegacyCodebase into release2012.08\n",
"Merge remote-tracking branch 'dmann1/p71148-s3-callplan_mapping' into lcst-operational-changes\n",
"Merge branch 'master' of git.internal.net:/git/live/LegacyCodebase into TASK-360148\n",
"Merge remote-tracking branch 'grockett/rpr-pre' into rpr-lite\n"
)
cleanCorpus <- function(vector){
  corpus <- Corpus(VectorSource(vector), readerControl = list(language = "en_US"))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, tolower)
  #corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, removePunctuation)
  #corpus <- tm_map(corpus, PlainTextDocument)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  return(corpus)
}
# this function is provided by a team member (in the link I posted above)
test <- function(keywords_doc){
  BigramTokenizer <- function(x)
    unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
  # creating of document matrix
  keywords_matrix <- TermDocumentMatrix(keywords_doc, control = list(tokenize = BigramTokenizer))
  # remove sparse terms
  keywords_naremoval <- removeSparseTerms(keywords_matrix, 0.99)
  # Frequency of the words appearing
  keyword.freq <- rowSums(as.matrix(keywords_naremoval))
  subsetkeyword.freq <- subset(keyword.freq, keyword.freq >= 20)
  frequentKeywordSubsetDF <- data.frame(term = names(subsetkeyword.freq), freq = subsetkeyword.freq)
  # Sorting of the words
  frequentKeywordDF <- data.frame(term = names(keyword.freq), freq = keyword.freq)
  frequentKeywordSubsetDF <- frequentKeywordSubsetDF[with(frequentKeywordSubsetDF, order(-frequentKeywordSubsetDF$freq)), ]
  frequentKeywordDF <- frequentKeywordDF[with(frequentKeywordDF, order(-frequentKeywordDF$freq)), ]
  # Printing of the words
  # wordcloud(frequentKeywordDF$term, freq=frequentKeywordDF$freq, random.order = FALSE, rot.per=0.35, scale=c(5,0.5), min.freq = 30, colors = brewer.pal(8,"Dark2"))
  return(frequentKeywordDF)
}
corpus <- cleanCorpus(comments)
t <- test(corpus)
> head(t)
term freq
added added 6
html html 6
tracking tracking 6
common common 4
emails emails 4
template template 4
Thanks,
I haven't found the reason either, but if you are only interested in the counts, regardless of which documents the bigrams occurred in, you could get them via this alternative pipeline:
library(tm)
library(dplyr)
library(quanteda)
# ..construct the corpus as in your post ...
corpus %>%
  unlist() %>%
  tokens() %>%
  tokens_ngrams(2:2, concatenator = " ") %>%
  unlist() %>%
  as.data.frame() %>%
  group_by_(".") %>%
  summarize(cnt = n()) %>%
  arrange(desc(cnt))
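If you prefer to stay within quanteda end to end, roughly the same counts can be read off a dfm of bigrams; a sketch built directly from the comments vector (treat the exact cleaning steps as an assumption, not part of the original answer):
library(quanteda)
toks <- tokens(comments, remove_numbers = TRUE, remove_punct = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_remove(toks, stopwords("english"))
bigrams <- tokens_ngrams(toks, n = 2, concatenator = " ")
topfeatures(dfm(bigrams), 10)  # most frequent bigrams and their counts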

R comparison.cloud wordcloud error when empty corpus

I'm using the comparison.cloud() function (wordcloud package) over dynamically served data, which works fine. However, it often happens that some corpus element (document) is empty, and this makes comparison.cloud() return an error.
So far I've made a loop that does not include empty subsets:
quests <- c("Q1","Q2","Q3")
wordslist <- list()
rmlist <- numeric()
count <- 1
for (quest in quests) {
  wordslist[[quest]] <- subset(df$element, df$question == quest)
  if (length(wordslist[[quest]]) == 0) {
    rmlist <- c(rmlist, count)
    wordslist[quest] <- NULL
  }
  count <- 1 + count
}
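(For reference, the same bookkeeping can be written more compactly; a sketch assuming the same df$element and df$question columns:)
wordslist <- lapply(quests, function(q) subset(df$element, df$question == q))
names(wordslist) <- quests
rmlist <- which(lengths(wordslist) == 0)   # positions of the empty questions
wordslist <- wordslist[lengths(wordslist) > 0]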
Then, later on, after having built the corpus, I use the counter to remove the column names:
tra$question <- tra$question[c(1,2,3)]
if (length(rmlist) != 0) {tra$question <- tra$question[-rmlist]}
So this works, but I wanted to ask if it can somehow be improved, especially whether there is any parameter I can feed to comparison.cloud to help with this. Also, is this behaviour a bug in comparison.cloud?
The result in my script is that if a question has no answers, that question won't be shown in the wordcloud. It would be much more preferable to keep an empty question (the question title with its empty part of the wordcloud) than to have to remove it. By the way, I successively tried to add:
- an "empty_Qn" character; result: it sits at the center, ugly, big, and draws all the attention;
- a repetition of "empty_Qn", but the repetition is cleaned out by tm and we again get the ugly output from above;
- an empty string "" or " ", but here again it was stripped out, and I again had the comparison.cloud error due to an empty corpus part.
EDIT : whole script
## Elements per question wordcloud
library(wordcloud)
library(tm)
load(file="mygroupAnswers.Rda")
df <- df[df$group == groupname,]
df <- droplevels(df)
# Stop if no data
if(length(df$elements)==0) q(save="no")
quests <- c("Q1","Q2","Q3")
wordslist <- list()
rmlist <- numeric()
count <- 1
for (quest in quests) {
  wordslist[[quest]] <- subset(df$element, df$question == quest)
  if (length(wordslist[[quest]]) == 0) {
    rmlist <- c(rmlist, count)
    wordslist[quest] <- NULL
  }
  count <- 1 + count
}
corpus <- Corpus(VectorSource(wordslist), readerControl = list(language = lang)) # Live Web version
#corpus <- Corpus(VectorSource(wordALL), readerControl = list(language = "fr")) # RStudio version
corpus <- tm_map(corpus, content_transformer(tolower))
spStopWords <- c(stopwords(tra2), "l'", "j'", "d'", "c'", "qu'", "quand", "avoir", "être", "etre", "quelqu'un", "plus", "tant", "bien", "mal") # Live Web version
spStopWords <- c(stopwords("french"), "l'", "j'", "d'", "c'", "qu'", "quand", "avoir", "être", "etre", "quelqu'un", "plus", "tant", "bien", "mal") # RStudio version
corpus <- tm_map(corpus, removeWords, spStopWords)
corpus <- tm_map(corpus, removePunctuation)
tdm <- TermDocumentMatrix(corpus)
tdm <- as.matrix(tdm)
tra$question <- tra$question[c(1,2,3)]
if (length(rmlist) != 0) {tra$question <- tra$question[-rmlist]}
colnames(tdm) <- tra$question
comparison.cloud(tdm, max.words=800, scale=c(6,1), title.size=1.2)

R - Automatic categorization of Wikipedia articles

I have been trying to follow this example by Norbert Ryciak, whom I haven't been able to get in touch with.
Since the article was written in 2014, some things in R have changed, so I have been able to update parts of the code, but I got stuck in the last part.
Here is my working code so far:
library(tm)
library(stringi)
library(proxy)
wiki <- "https://en.wikipedia.org/wiki/"
titles <- c("Integral", "Riemann_integral", "Riemann-Stieltjes_integral", "Derivative",
"Limit_of_a_sequence", "Edvard_Munch", "Vincent_van_Gogh", "Jan_Matejko",
"Lev_Tolstoj", "Franz_Kafka", "J._R._R._Tolkien")
articles <- character(length(titles))
for (i in 1:length(titles)) {
articles[i] <- stri_flatten(readLines(stri_paste(wiki, titles[i])), col = " ")
}
docs <- Corpus(VectorSource(articles))
docs[[1]]
docs2 <- tm_map(docs, function(x) stri_replace_all_regex(x, "<.+?>", " "))
docs3 <- tm_map(docs2, function(x) stri_replace_all_fixed(x, "\t", " "))
docs4 <- tm_map(docs3, PlainTextDocument)
docs5 <- tm_map(docs4, stripWhitespace)
docs6 <- tm_map(docs5, removeWords, stopwords("english"))
docs7 <- tm_map(docs6, removePunctuation)
docs8 <- tm_map(docs7, content_transformer(tolower))
docs8[[1]]
docsTDM <- TermDocumentMatrix(docs8)
docsTDM2 <- as.matrix(docsTDM)
docsdissim <- dist(docsTDM2, method = "cosine")
But I haven't been able to get past this part:
docsdissim2 <- as.matrix(docsdissim)
rownames(docsdissim2) <- titles
colnames(docsdissim2) <- titles
docsdissim2
h <- hclust(docsdissim, method = "ward.D")
plot(h, labels = titles, sub = "")
I tried to run the hclust directly, and then I was able to plot, but nothing readable came out of it.
These are the errors I'm getting:
rownames(docsdissim2) <- titles
Error in `rownames<-`(`*tmp*`, value = c("Integral", "Riemann_integral", :
length of 'dimnames' [1] not equal to array extent
Another:
plot(h, labels = titles, sub = "")
Error in graphics:::plotHclust(n1, merge, height, order(x$order), hang, :
invalid dendrogram input
Is there anyone who could give me a hand finishing this example?
Best regards,
I was able to solve this problem thanks to Norbert Ryciak (the author of the tutorial).
Since he used an older version of "tm" (which was probably the latest at the time), it was not compatible with the one I used.
The solution was to replace "docsTDM <- TermDocumentMatrix(docs8)" with "docsTDM <- DocumentTermMatrix(docs8)": dist() measures distances between the rows of the matrix, so the rows have to be the documents (not the terms) for the distance matrix to match the eleven titles.
So the final code:
library(tm)
library(stringi)
library(proxy)
wiki <- "https://en.wikipedia.org/wiki/"
titles <- c("Integral", "Riemann_integral", "Riemann-Stieltjes_integral", "Derivative",
"Limit_of_a_sequence", "Edvard_Munch", "Vincent_van_Gogh", "Jan_Matejko",
"Lev_Tolstoj", "Franz_Kafka", "J._R._R._Tolkien")
articles <- character(length(titles))
for (i in 1:length(titles)) {
articles[i] <- stri_flatten(readLines(stri_paste(wiki, titles[i])), col = " ")
}
docs <- Corpus(VectorSource(articles))
docs[[1]]
docs2 <- tm_map(docs, function(x) stri_replace_all_regex(x, "<.+?>", " "))
docs3 <- tm_map(docs2, function(x) stri_replace_all_fixed(x, "\t", " "))
docs4 <- tm_map(docs3, PlainTextDocument)
docs5 <- tm_map(docs4, stripWhitespace)
docs6 <- tm_map(docs5, removeWords, stopwords("english"))
docs7 <- tm_map(docs6, removePunctuation)
docs8 <- tm_map(docs7, content_transformer(tolower))
docs8[[1]]
docsTDM <- DocumentTermMatrix(docs8)
docsTDM2 <- as.matrix(docsTDM)
docsdissim <- dist(docsTDM2, method = "cosine")
docsdissim2 <- as.matrix(docsdissim)
rownames(docsdissim2) <- titles
colnames(docsdissim2) <- titles
docsdissim2
h <- hclust(docsdissim, method = "ward")
plot(h, labels = titles, sub = "")
