R: tm package on German text - r

I want to perform Sentiment classification on German dataset, I am using the following code, which works fine with english text, but raising error in case of German text.
Here is my code for the following:
#loading required libraries
library(tm)
library(readxl)
library(data.table)
library(plyr)
library(dplyr)
library(zoo)
library(ggplot2)
library(ranger)
library(e1071)
df<- data.table(read_excel("data/German2datasets.xlsx", skip = 1))
# An abstract function to preprocess a text column
preprocess <- function(text_column)
{
# Use tm to get a doc matrix
corpus <- Corpus(VectorSource(text_column))
# all lower case
corpus <- tm_map(corpus, content_transformer(tolower))
# remove punctuation
corpus <- tm_map(corpus, content_transformer(removePunctuation))
# remove numbers
corpus <- tm_map(corpus, content_transformer(removeNumbers))
# remove stopwords
corpus <- tm_map(corpus, removeWords, stopwords("german"))
# stem document
corpus <- tm_map(corpus, stemDocument)
# strip white spaces (always at the end)
corpus <- tm_map(corpus, stripWhitespace)
# return
corpus
}
# Get preprocess training and test data
corpus <- preprocess(df$TEXT)
# Create a Document Term Matrix for train and test
# Just including bi and tri-grams
Sys.setenv(JAVA_HOME='D://Program Files/Java/jre1.8.0_112') # for 32-bit version
library(rJava)
library(RWeka)
# Bi-Trigram tokenizer function (you can always get longer n-grams)
bitrigramtokeniser <- function(x, n) {
RWeka:::NGramTokenizer(x, RWeka:::Weka_control(min = 2, max = 3))
}
"
Remove remove words <=2
TdIdf weighting
Infrequent (< than 1% of documents) and very frequent (> 80% of documents) terms not included
"
dtm <- DocumentTermMatrix(corpus, control=list(wordLengths=c(2, Inf),
tokenize = bitrigramtokeniser,
weighting = function(x) weightTfIdf(x, normalize = FALSE),
bounds=list(global=c(floor(length(corpus)*0.01), floor(length(corpus)*.8)))))
sent <- df$Sentiment
# Variable selection
# ~~~~~~~~~~~~~~~~~~~~
"
For dimension reduction.
The function calculates chi-square value for each phrase and keeps phrases with highest chi_square values
Ideally you want to put variable selection as part of cross-validation.
chisqTwo function takes:
document term matrix (dtm),
vector of labels (labels), and
number of n-grams you want to keep (n_out)
"
chisqTwo <- function(dtm, labels, n_out=2000){
mat <- as.matrix(dtm)
cat1 <- colSums(mat[labels==T,]) # total number of times phrase used in cat1
cat2 <- colSums(mat[labels==F,]) # total number of times phrase used in cat2
n_cat1 <- sum(mat[labels==T,]) - cat1 # total number of phrases in soft minus cat1
n_cat2 <- sum(mat[labels==F,]) - cat2 # total number of phrases in hard minus cat2
num <- (cat1*n_cat2 - cat2*n_cat1)^2
den <- (cat1 + cat2)*(cat1 + n_cat1)*(cat2 + n_cat2)*(n_cat1 + n_cat2)
chisq <- num/den
chi_order <- chisq[order(chisq)][1:n_out]
mat <- mat[, colnames(mat) %in% names(chi_order)]
}
n <- nrow(dtm)
shuffled <- dtm[sample(n),]
train_dtm <- shuffled[1:round(0.7 * n),]
test_dtm <- shuffled[(round(0.7 * n) + 1):n,]
"
With high dimensional data, test matrix may not have all the phrases training matrix has.
This function fixes that - so that test matrix has the same columns as training.
testmat takes column names of training matrix (train_mat_cols), and
test matrix (test_mat)
and outputs test_matrix with the same columns as training matrix
"
# Test matrix maker
testmat <- function(train_mat_cols, test_mat){
# train_mat_cols <- colnames(train_mat); test_mat <- as.matrix(test_dtm)
test_mat <- test_mat[, colnames(test_mat) %in% train_mat_cols]
miss_names <- train_mat_cols[!(train_mat_cols %in% colnames(test_mat))]
if(length(miss_names)!=0){
colClasses <- rep("numeric", length(miss_names))
df <- read.table(text = '', colClasses = colClasses, col.names = miss_names)
df[1:nrow(test_mat),] <- 0
test_mat <- cbind(test_mat, df)
}
as.matrix(test_mat)
}
# Train and test matrices
train_mat <- chisqTwo(train_dtm, train$Sentiment)
test_mat <- testmat(colnames(train_mat), as.matrix(test_dtm))
dim(train_mat)
dim(test_mat)
n <- nrow(df)
shuffled <- df[sample(n),]
train_data <- shuffled[1:round(0.7 * n),]
test_data <- shuffled[(round(0.7 * n) + 1):n,]
train_mat <- as.data.frame(as.matrix(train_mat))
colnames(train_mat) <- make.names(colnames(train_mat))
train_mat$Sentiment <- train_data$Sentiment
test_mat <- as.data.frame(as.matrix(test_mat))
colnames(test_mat) <- make.names(colnames(test_mat))
test_mat$Sentiment <- test_data$Sentiment
train_mat$Sentiment <- as.factor(train_mat$Sentiment)
test_mat$Sentiment <- as.factor(test_mat$Sentiment)
Then, I will apply caret ML algos on the same for prediction of the Sentiment on the train and test data created.
I am getting the following error at "preprocess" function.
> corpus <- preprocess(df$TEXT)
Show Traceback
Rerun with Debug
Error in FUN(content(x), ...) :
invalid input 'Ich bin seit Jahren zufrieden mit der Basler Versicherubg🌺' in 'utf8towcs'
Data - https://drive.google.com/open?id=1T_LpL2G8upztihAC2SQeVs4YCPH-yfOs

How about trying a different package to get to the pre-Weka etc stages? This is equivalent (and simpler imho):
library("quanteda")
library("readtext")
# reads in the spreadsheet and creates the corpus
germancorp <-
readtext("data/German2datasets.xlsx", text_field = "TEXT")) %>%
corpus()
# does all of the steps of your preprocess() function
dtm <- dfm(germancorp, ngrams = c(2, 3),
tolower = TRUE,
remove_punct = TRUE,
remove_numbers = TRUE,
remove = stopwords("german"),
stem = TRUE)
# remove words with only a single count
dtm <- dfm_trim(dtm, min_count = 2)
# form tf-idf weights - change the base argument from default 10 if you wish
dtm <- dfm_tfidf(dtm)
# if you really want a tm formatted DocumentTermMatrix
convert(dtm, to = "tm")
The quanteda package can do some of what you list as additional steps, although it is not clear exactly what you are doing. (Your question focused on the preprocess() failure so I answered that.)

if you haven´t found the reason yet:
invalid input in 'utf8towcs'
It is the encoding of the file (depending on your [virtual] environment and the current sys-options and of course on the the of saving the file to disk at the time of creation)
A workaround is like:
usableText=str_replace_all(tweets$text,"[^[:graph:]]", " ")
or
your_corpus<- tm_map(your_corpus,toSpace,"[^[:graph:]]")

Related

How to export rStudio analysis detail

working on a document classification model using the code provided by Tim DAuria (https://www.youtube.com/watch?v=j1V2McKbkLo), but can not figure out how to actually see the detail analysis of the classification on the 'Test' data.
I am using the model to help classify contracts by type and want to see the specific classification assigned to the different contracts. For example, the model assigns 15 contracts as "x" type of contract. How can I view those 15 file names? The code below works great for the classification piece. Just posting it for reference.
Please help! Really new at this and I'm sure I'm missing something obvious but could not find anything on the web.
Classification Code below:
#int
libs <- c("tm", "plyr","class")
lapply(libs, require, character.only = TRUE)
#set options
options(stringsAsFactors = FALSE)
#set parameters
contract <- c("build construction", "other")
pathname <- ..Desktop/ML/ContractReview"
#clean text
cleanCorpus <- function(corpus) {
corpus.tmp <- tm_map(corpus, removePunctuation)
corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)
corpus.tmp <- tm_map(corpus.tmp, content_transformer(tolower))
corpus.tmp <- tm_map(corpus.tmp, removeWords, stopwords("english"))
corpus.tmp <- tm_map(corpus.tmp, stemDocument)
return(corpus.tmp)
}
#build TDM
generateTDM <- function(contract, path) {
c.dir <- sprintf ("%s/%s", path, contract)
c.cor <- VCorpus(DirSource(directory = c.dir), readerControl = list(reader=readPlain))
c.cor.cl <- cleanCorpus(c.cor)
c.tdm <- TermDocumentMatrix(c.cor.cl)
c.tdm <- removeSparseTerms(c.tdm, .07)
result <- list(name = contract, tdm = c.tdm)
}
tdm <- lapply(contract, generateTDM, path = pathname)
# attach name
bindcontractToTDM <- function(tdm) {
c.mat <-t(data.matrix(tdm[["tdm"]]))
c.df <- as.data.frame(c.mat, stringsAsFactors = FALSE)
c.df <- cbind(c.df, rep(tdm[["name"]], nrow(c.df)))
colnames(c.df) [ncol(c.df)] <- "targetcontract"
return(c.df)
}
contractTDM <- lapply(tdm, bindcontractToTDM)
#stack if you have more than one dataframe
tdm.stack <- do.call(rbind.fill, contractTDM)
tdm.stack[is.na(tdm.stack)] <-0
#hold-out
train.idx <- sample(nrow(tdm.stack), ceiling(nrow(tdm.stack)* 0.7))
test.idx <- (1:nrow(tdm.stack))[- train.idx]
#model - knn
tdm.contract <-tdm.stack[, "targetcontract"]
tdm.stack.nl <- tdm.stack[, !colnames(tdm.stack) %in% "targetcontract"]
knn.pred <- knn(tdm.stack.nl[train.idx, ], tdm.stack.nl[test.idx, ], tdm.contract[train.idx])
#accuracy
conf.mat <- table("predictions"= knn.pred, Actual = tdm.contract[test.idx])
(accuracy <- sum(diag(conf.mat)) / length(test.idx)*100)
The answer to your question is hidden in the knn.pred object, which stores predicted labels for the test cases.
Since there is no input data provided (I don't think it was provided by the author of the video?), I am not sure about the details of the contract classes from your example. However, the output of the knn function from the class package you are using is a factor with labels that the algorithm predicted for the test documents
(you will notice that
length(knn.pred)
is equal to 0.3 * nrow(tdm.stack) ).
If you would like to view/store the predicted label and the actual label for each entry, then you can create a suitable data frame:
label_df <- cbind(label_pred = knn.pred, label_actual = tdm.contract[test.idx])
Alternatively, you can also choose to include the remaining columns of the tdm.contract (if you would like to re-examine the tdm.stack information in that context)
label_df <- cbind(label_pred = knn.pred, label_actual = tdm.contract[test.idx], tdm.stack.nl[test.idx,])
You can then filter either of these data frames to see how your entries of interest have been labelled.
Alternatively, you can choose to run k nearest neighbors algorithm using a function from a different package, in which case the output might differ.

Calculating Proportion of Column Total compared to total of all columns

I'm attempting to calculate the Proportion of a column total compared to the total of all columns.
The Document Term Matrix I'm working with is quite large, causing any tests I run incorrectly to basically crash Rstudio.
Here is my working code so far:
randomSample = read.csv("randomSample2016.csv", stringsAsFactors = FALSE)
str(randomSample)
randomSample$tweet <- as.character(randomSample$tweet)
randomSample$tweetlength <- nchar(randomSample$tweet)
hist(randomSample$tweetlength)
library("tm")
## Use the tm library to construct a document-term matrix of term
frequencies
randomSample_corpus <- Corpus(VectorSource(randomSample$tweet))
print(randomSample_corpus)
inspect(randomSample_corpus[1:3])
#clean up corpus
#make all letters lowercase
randomSample_corpus_clean <- tm_map(randomSample_corpus, tolower)
#Remove Numbers
randomSample_corpus_clean <- tm_map(randomSample_corpus_clean,
removeNumbers)
#Remove punctuation
randomSample_corpus_clean <- tm_map(randomSample_corpus_clean,
removePunctuation)
#Remove stop words
randomSample_corpus_clean <- tm_map(randomSample_corpus_clean, removeWords,
stopwords())
#remove unneeded whitespace
randomSample_corpus_clean <- tm_map(randomSample_corpus_clean,
stripWhitespace)
#Inspect cleaned corpus
inspect(randomSample_corpus_clean[1:3])
#Create document term matrix
randomSample_dtm <- DocumentTermMatrix(randomSample_corpus_clean)
#convert to R matrix
dtm2 <- as.matrix(randomSample_dtm)
#obtain individual word frequencies
frequency <- colSums(dtm2)
This allows me to get the total frequency of all words in dtm2, however when I try to add a new row for column totals (dtm2$newcolumn <- 0) I end up with an insane amount of ram usage.
That's because your colSums returns a named-array. what you want is
as.matrix(randomSample_dtm) %>% {
rbind(., rbind(colSums(.) %>% as.numeric)) %>% as.matrix()
}

A lemmatizing function using a hash dictionary does not work with tm package in R

I would like to lemmatize Polish text using a large external dictionary (format like in txt variable below). I am not lucky, to have an option Polish with popular text mining packages. The answer https://stackoverflow.com/a/45790325/3480717 by #DmitriySelivanov works well with simple vector of texts. (I have also removed Polish diacritics from both the dictionary and corpus.) The function works well with a vector of texts.
Unfortunately it does not work with the corpus format generated by tm. Let me paste Dmitriy's code:
library(hashmap)
library(data.table)
txt =
"Abadan Abadanem
Abadan Abadanie
Abadan Abadanowi
Abadan Abadanu
abadańczyk abadańczycy
abadańczyk abadańczykach
abadańczyk abadańczykami
"
dt = fread(txt, header = F, col.names = c("lemma", "word"))
lemma_hm = hashmap(dt$word, dt$lemma)
lemma_hm[["Abadanu"]]
#"Abadan"
lemma_tokenizer = function(x, lemma_hashmap,
tokenizer = text2vec::word_tokenizer) {
tokens_list = tokenizer(x)
for(i in seq_along(tokens_list)) {
tokens = tokens_list[[i]]
replacements = lemma_hashmap[[tokens]]
ind = !is.na(replacements)
tokens_list[[i]][ind] = replacements[ind]
}
tokens_list
}
texts = c("Abadanowi abadańczykach OutOfVocabulary",
"abadańczyk Abadan OutOfVocabulary")
lemma_tokenizer(texts, lemma_hm)
#[[1]]
#[1] "Abadan" "abadańczyk" "OutOfVocabulary"
#[[2]]
#[1] "abadańczyk" "Abadan" "OutOfVocabulary"
now I would like to apply it on tm corpus "docs" here is an example syntax I would use with tm package, on tm generated corpus.
docs <- tm_map(docs, function(x) lemma_tokenizer(x, lemma_hashmap="lemma_hm"))
another syntax that I tried:
LemmaTokenizer <- function(x) lemma_tokenizer(x, lemma_hashmap="lemma_hm")
docsTDM <-
DocumentTermMatrix(docs, control = list(wordLengths = c(4, 25), tokenize=LemmaTokenizer))
It throws at me an error:
Error in lemma_hashmap[[tokens]] :
attempt to select more than one element in vectorIndex
The function works with a vector of texts but it will not work with tm corpus. Thanks in advance for suggestions (even use of this function with other text mining package if it will not work with tm).
I see two problems here. 1) your custom function returns a list, while it should return a vector of strings; and 2) you are passing a wrong lemma_hashmap argument.
A quick workaround to fix the first problem is to use paste() and sapply() before returning the function result.
lemma_tokenizer = function(x, lemma_hashmap,
tokenizer = text2vec::word_tokenizer) {
tokens_list = tokenizer(x)
for(i in seq_along(tokens_list)) {
tokens = tokens_list[[i]]
replacements = lemma_hashmap[[tokens]]
ind = !is.na(replacements)
tokens_list[[i]][ind] = replacements[ind]
}
# paste together, return a vector
sapply(tokens_list, (function(i){paste(i, collapse = " ")}))
}
We can run the same example of your post.
texts = c("Abadanowi abadańczykach OutOfVocabulary",
"abadańczyk Abadan OutOfVocabulary")
lemma_tokenizer(texts, lemma_hm)
[1] "Abadan abadańczyk OutOfVocabulary" "abadańczyk Abadan OutOfVocabulary"
Now, we can use tm_map. Just make sure to use lemma_hm (i.e., the variable) and not "lemma_hm" (a string) as argument.
docs <- SimpleCorpus(VectorSource(texts))
out <- tm_map(docs, (function(x) {lemma_tokenizer(x, lemma_hashmap=lemma_hm)}))
out[[1]]$content
[1] "Abadan abadańczyk OutOfVocabulary"
For polish lemmatization please refer to this script
https://github.com/MarcinKosinski/trigeR5/blob/master/R/lematyzacja.R that uses this polmorfologik dictionary https://github.com/MarcinKosinski/trigeR5/tree/master/dicts (and also stop words are included there).
Try using quanteda's dictionary() function, after creating a dictionary mapping each variant as a dictionary value, to the lemma as a dictionary key. Below, it looks up your values and then pastes the tokens back into a text. (If you wanted tokens, you would not need the last paste() operation.
txt <-
"Abadan Abadanem
Abadan Abadanie
Abadan Abadanowi
Abadan Abadanu
abadańczyk abadańczycy
abadańczyk abadańczykach
abadańczyk abadańczykami"
list_temp <- strsplit(readLines(textConnection(txt)), "\\s+")
list_temp2 <- lapply(list_temp, "[", 2)
names(list_temp2) <- sapply(list_temp, "[", 1)
library("quanteda")
polish_lemma_dict <- dictionary(list_temp2)
# Dictionary object with 7 key entries.
# - Abadan:
# - abadanem
# - Abadan:
# - abadanie
# - Abadan:
# - abadanowi
# - Abadan:
# - abadanu
# - abadańczyk:
# - abadańczycy
# - abadańczyk:
# - abadańczykach
# - abadańczyk:
# - abadańczykami
texts <- c("Abadanowi abadańczykach OutOfVocabulary",
"abadańczyk Abadan OutOfVocabulary")
The texts can now be converted into tokens, and use quanteda's tokens_lookup() function to replace the dictionary values (inflected words) with the dictionary keys (lemmas). In the last step, I've pasted the tokens back together, which you can skip if you want tokens and not a full text.
require(magrittr)
texts %>%
tokens() %>%
tokens_lookup(dictionary = polish_lemma_dict, exclusive = FALSE, capkeys = FALSE) %>%
as.character() %>%
paste(collapse = " ")
# [1] "Abadan abadańczyk OutOfVocabulary abadańczyk Abadan OutOfVocabulary"
Here is the complete imperfect code I used the answer in. Credits to many people, I described all sources on the bottom. It is very rough, I realise, but it catches mise for me, ie. I can use txt lemmes dictionary and my stopwords to classify Polish texts. Thanks to Damiano Fantini, Dmitriy Selivanov and many others.
#----1. Set up. ----
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
library(readtext)
library(tm)
library(proxy)
library(stringi)
library(stringr)
library(hashmap)
library(data.table)
library(text2vec)
# For reading n-grams
library(RWeka) #(*)
BigramTokenizer <-
function(x) NGramTokenizer(x, Weka_control(min = 1, max = 3)) #(*)
#----2. Read data. ----
stopwordsPL <- as.vector(str_split(readLines("polish.stopwords.text",encoding = "UTF-8"), pattern = " ",simplify = T))
docs <- VCorpus(DirSource(pattern="txt"))
titles <- rownames(summary(docs))
docs <- tm_map(docs, removeWords, words=stopwordsPL)
docs <- tm_map(docs, tolower)
docs <- tm_map(docs, function(x) stri_trans_general(x, "Latin-ASCII"))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)
# for English texts it would be simpler
# docs <- tm_map(docs, removeWords, stopwords("english")) #can add other words to remove
# docs <- tm_map(docs, stemDocument, "english")
#====3. Lemmatize ====
# # Dictionary from http://www.lexiconista.com/datasets/lemmatization/
# lemmadict_file = "lemmatization-pl.text"
# dt = fread(file= lemmadict_file, header = F, col.names = c("lemma", "word"), data.table=T, encoding="UTF-8")
# # I threw away Polish letters, maybe changing locales may help.
# dt$lemma <- stri_trans_general(dt$lemma, "Latin-ASCII;lower")
# dt$word <- stri_trans_general(dt$word, "Latin-ASCII;lower")
# dt <- unique(dt)
#
# # Creating hash dictionary
# lemma_hm = hashmap(dt$word, dt$lemma)
#
# # Test if it works
# lemma_hm[["mnozyl"]]
# # [1] "mnozyc"
#
# save_hashmap(lemma_hm, file="lemma_hm", overwrite = TRUE, compress = TRUE)
lemma_hm <- load_hashmap(file="lemma_hm")
lemma_tokenizer = function(x, lemma_hashmap,
tokenizer = text2vec::word_tokenizer) {
tokens_list = tokenizer(x)
for(i in seq_along(tokens_list)) {
tokens = tokens_list[[i]]
replacements = lemma_hashmap[[tokens]]
ind = !is.na(replacements)
tokens_list[[i]][ind] = replacements[ind]
}
# paste together, return a vector
sapply(tokens_list, (function(i){paste(i, collapse = " ")}))
}
docs <- tm_map(docs, (function(x) {lemma_tokenizer(x, lemma_hashmap=lemma_hm)}))
docs <- tm_map(docs, PlainTextDocument)
#====4. Create document term matrix====
docsTDM <-
DocumentTermMatrix(docs, control = list(wordLengths = c(5, 25),tokenize = BigramTokenizer)) # tokenize=LemmaTokenizer, tokenize = BigramTokenizer (*)
docsTDM$dimnames
#====5. Remove sparse and common words====
docsTDM <- removeSparseTerms(docsTDM, .90)
# https://stackoverflow.com/questions/25905144/removing-overly-common-words-occur-in-more-than-80-of-the-documents-in-r
removeCommonTerms <- function (x, pct)
{
stopifnot(inherits(x, c("DocumentTermMatrix", "TermDocumentMatrix")),
is.numeric(pct), pct > 0, pct < 1)
m <- if (inherits(x, "DocumentTermMatrix"))
t(x)
else x
t <- table(m$i) < m$ncol * (pct)
termIndex <- as.numeric(names(t[t]))
if (inherits(x, "DocumentTermMatrix"))
x[, termIndex]
else x[termIndex, ]
}
docsTDM <-
removeCommonTerms(docsTDM, .8) #remove terms that are in >=80% of the documents
docsTDM$dimnames
#====6. Cluster data (hclust). ====
docsdissim <- dist(as.matrix(docsTDM), method = "cosine")
docsdissim2 <- as.matrix(docsdissim)
dim(docsdissim2)
rownames(docsdissim2) <- titles
colnames(docsdissim2) <- titles
h <- hclust(docsdissim, method = "ward.D2")
plot(h, labels = titles, sub = "")
# Library hclust with p-values (pvclust)
library(pvclust)
h_pv <- pvclust(docsdissim2, method.hclust = "ward.D2", method.dist ="correlation")
plot(h_pv)
data.frame(cutree(tree = h_pv$hclust, k = 4))
# pvclust provides two types of p-values: AU (Approximately Unbiased) p-value and BP (Bootstrap Probability) value.
# AU p-value, which is computed by multiscale bootstrap resampling, is a better approximation to unbiased p-value
# than BP value computed by normal bootstrap resampling.
# AU p-value > 0.95 we can assume the clusters exist and may stably be
# observed if we increase the number of observations.
# (http://stat.sys.i.kyoto-u.ac.jp/prog/pvclust/)
#==== Literature:====
# Original article:
# http://www.rexamine.com/2014/06/text-mining-in-r-automatic-categorization-of-wikipedia-articles/
# Updates to make it work after some functions became obsolete:
# https://stackoverflow.com/questions/34423823/r-automatic-categorization-of-wikipedia-articles
# https://stackoverflow.com/questions/34372166/error-using-termdocumentmatrix-and-dist-functions-in-r
#
# Based on that:
# http://brazenly.blogspot.co.uk/2015/02/r-categorization-clustering-of.html
#
# Sparse terms:
# https://stackoverflow.com/questions/28763389/how-does-the-removesparseterms-in-r-work
# Lemmatizing function:
# https://stackoverflow.com/questions/46122591/a-lemmatizing-function-using-a-hash-dictionary-does-not-work-with-tm-package-in
# https://stackoverflow.com/questions/45762559/lemmatization-using-txt-file-with-lemmes-in-r/45790325#45790325

Empty term document matrix

I seem to run into a problem whenever I try to inspect my freq. words and associations.
When I make the tdm I get this info:
TermDocumentMatrix
I can see I have plenty of terms to use, in plenty of documents.
However!
When I try to inspect the content of "tdm", I get this info:
Inspecting the TDM
Howcome the tdm all of a sudden is empty?
Hope someone can help
tweets <- userTimeline("RDataMining", n = 1000)
(n.tweet <- length(tweets))
tweets[1:3]
#convert tweets to a data frame
tweets.df <- twListToDF(tweets)
dim(tweets.df)
##Text cleaning
library(tm)
#build a corpus and specify the source to be a character vector
myCorpus <- Corpus(VectorSource(tweets.df$text))
#convert to lower case
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
#remove URLs
removeURL <- function(x) gsub ("http[^[:space:]]*","",x)
myCorpus <- tm_map(myCorpus,content_transformer(removeURL))
#remove anything other than English letters or space
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*","",x)
myCorpus <- tm_map(myCorpus,content_transformer(removeNumPunct))
#remove stopwords + 2
myStopwords <- c(stopwords('english'),"available","via")
#remove "r" and "big" from stopwords
myStopwords <- setdiff(myStopwords, c("r","big"))
#remove stopwords from corpus
myCorpus <- tm_map(myCorpus,removeWords,myStopwords)
#remove extra whitespace
myCorpus <- tm_map(myCorpus, stripWhitespace)
#keep a copy of corpus to use later as a dictionary for stem completion
myCorpusCopy <- myCorpus
#stem words
library(SnowballC)
myCorpus <- tm_map(myCorpus,stemDocument)
stemCompletion2 <- function(x,dictionary) {
x <- unlist(strsplit(as.character(x),""))
#because stemCompletion completes an empty string to a word in dict. Remove empty string to avoid this
x <- x[x !=""]
x <- stemCompletion(x, dictionary = dictionary)
x <- paste (x,sep = "",collapse = "")
PlainTextDocument(stripWhitespace(x))
}
myCorpus <- lapply(myCorpus, stemCompletion2, dictionary = myCorpusCopy)
myCorpus <- Corpus(VectorSource(myCorpus))
#count freq of "mining"
miningCases <- lapply(myCorpusCopy,
function(x) {grep(as.character(x),pattern = "\\<mining")})
sum(unlist(miningCases))
#count freq of "miner"
miningCases <- lapply(myCorpusCopy,
function(x) {grep(as.character(x),pattern = "\\<miner")})
sum(unlist(miningCases))
#count freq of "r"
miningCases <- lapply(myCorpusCopy,
function(x) {grep(as.character(x),pattern = "\\<r")})
sum(unlist(miningCases))
#replace "miner" with "mining"
myCorpus <- tm_map(myCorpus,content_transformer(gsub),
pattern = "miner", replacement = "mining")
tdm <- TermDocumentMatrix(myCorpus, control = list(removePunctuation = TRUE,stopwords = TRUE))
tdm
##Freq words and associations
idx <- which(dimnames(tdm)$Terms == "r")
inspect(tdm[idx + (0:5), 101:110])
#inspect frequent words
(freq.terms <- findFreqTerms(tdm, lowfreq = 15))
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq,term.freq >= 15)
df <- data.frame(term = names(term.freq), freq = term.freq)
I've been using the following Twitter query to test your code:
tweets = searchTwitter("r data mining", n=10)
and I think the problem is with your function stemCompletion2, which should look something like this:
stemCompletion2 <- function(x,dictionary) {
x <- unlist(strsplit(as.character(x)," "))
print("before:")
print(x)
#because stemCompletion completes an empty string to a word in dict. Remove empty string to avoid this
x <- x[x !=""]
x <- stemCompletion(x, dictionary = dictionary)
print("after:")
print(x)
x <- paste(x, sep = " ")
PlainTextDocument(stripWhitespace(x))
}
The modifications are as follows: before you had
x <- unlist(strsplit(as.character(x),""))
which was creating a list with all the characters of in each of the documents, and I've modified it to
x <- unlist(strsplit(as.character(x)," "))
to create a list of words. Similarly, when recomposing your documents, you where doing
x <- paste (x,sep = "",collapse = "")
which was creating the long strings you mention in your post, and I've modified it to:
x <- paste(x, sep = " ")
to recompose the words.
One example of the completions would be for my data:
[1] "before:"
[1] "rt" "ebookdealalert" "r" "datamin" "project" "learn" "data" "mine"
[9] "realworld" "project" "book" "solv" "predict" "model"
[1] "after:"
rt ebookdealalert r datamin project learn data mine
"rt" "ebookdealalerts" "r" "datamining" "projects" "learn" "data" ""
realworld project book solv predict model
"realworld" "projects" "book" "solve" "predictive" "modeling"
After that step, you may be able to work with TermDocumentMatrix as expected.
Hope it helps.

Support Vector Machine works on Training-set but not on Test-set in R (using e1071)

I'm using a support vector machine for my document classification task! it classifies all my Articles in the training-set, but fails to classify the ones in my test-set!
trainDTM is the document term matrix of my training-set. testDTM is the one for the test-set.
here's my (not so beautiful) code:
# create data.frame with labelled sentences
labeled <- as.data.frame(read.xlsx("C:\\Users\\LABELED.xlsx", 1, header=T))
# create training set and test set
traindata <- as.data.frame(labeled[1:700,c("ARTICLE","CLASS")])
testdata <- as.data.frame(labeled[701:1000, c("ARTICLE","CLASS")])
# Vector, Source Transformation
trainvector <- as.vector(traindata$"ARTICLE")
testvector <- as.vector(testdata$"ARTICLE")
trainsource <- VectorSource(trainvector)
testsource <- VectorSource(testvector)
# CREATE CORPUS FOR DATA
traincorpus <- Corpus(trainsource)
testcorpus <- Corpus(testsource)
# my own stopwords
sw <- c("i", "me", "my")
## CLEAN TEXT
# FUNCTION FOR CLEANING
cleanCorpus <- function(corpus){
corpus.tmp <- tm_map(corpus, removePunctuation)
corpus.tmp <- tm_map(corpus.tmp,stripWhitespace)
corpus.tmp <- tm_map(corpus.tmp,tolower)
corpus.tmp <- tm_map(corpus.tmp, removeWords, sw)
corpus.tmp <- tm_map(corpus.tmp, removeNumbers)
corpus.tmp <- tm_map(corpus.tmp, stemDocument, language="en")
return(corpus.tmp)}
# CLEAN CORP WITH ABOVE FUNCTION
traincorpus.cln <- cleanCorpus(traincorpus)
testcorpus.cln <- cleanCorpus(testcorpus)
## CREATE N-GRAM DOCUMENT TERM MATRIX
# CREATE N-GRAM TOKENIZER
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
# CREATE DTM
trainmatrix.cln.bi <- DocumentTermMatrix(traincorpus.cln, control = list(tokenize = BigramTokenizer))
testmatrix.cln.bi <- DocumentTermMatrix(testcorpus.cln, control = list(tokenize = BigramTokenizer))
# REMOVE SPARSE TERMS
trainDTM <- removeSparseTerms(trainmatrix.cln.bi, 0.98)
testDTM <- removeSparseTerms(testmatrix.cln.bi, 0.98)
# train the model
SVM <- svm(as.matrix(trainDTM), as.factor(traindata$CLASS))
# get classifications for training-set
results.train <- predict(SVM, as.matrix(trainDTM)) # works fine!
# get classifications for test-set
results <- predict(SVM,as.matrix(testDTM))
Error in scale.default(newdata[, object$scaled, drop = FALSE], center = object$x.scale$"scaled:center", :
length of 'center' must equal the number of columns of 'x'
i don't understand this error. and what is 'center' ?
thank you!!
Train and test data must be in the same features space ; building two separates DTM in that way can't work.
A solution with using RTextTools :
DocTermMatrix <- create_matrix(labeled, language="english", removeNumbers=TRUE, stemWords=TRUE, ...)
container <- create_container(DocTermMatrix, labels, trainSize=1:700, testSize=701:1000, virgin=FALSE)
models <- train_models(container, "SVM")
results <- classify_models(container, models)
Or, to answer your question (with e1071), you can specify the vocabulary ('features') in the projection (DocumentTermMatrix) :
DocTermMatrixTrain <- DocumentTermMatrix(Corpus(VectorSource(trainDoc)));
Features <- DocTermMatrixTrain$dimnames$Terms;
DocTermMatrixTest <- DocumentTermMatrix(Corpus(VectorSource(testDoc)),control=list(dictionary=Features));

Resources