We have prepared machine learning models, such as a classification algorithm that uses factor features, as well as topic modelling on text data; so far, all of the text data has been in English.
Below is the script we have prepared.
# Build a document-term matrix from the Text column, combine it with the
# categorical columns, fit an SVM classifier on Target and save the result.
complete <- subset(complete, select = c(Group, Type, Text, Target))
data <- complete$Text

# Create the corpus from the text column; every tm_map() call below needs it.
corpus <- Corpus(VectorSource(data))
corpus <- tm_map(corpus, content_transformer(tolower))

# toSpace replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# removeSpecialChars deletes every character outside [a-zA-Z0-9 ].
# NOTE: the pattern is ASCII-only, so accented and non-Latin characters are
# deleted as well.
removeSpecialChars <- function(x) gsub("[^a-zA-Z0-9 ]", "", x)

# Turn separators into spaces first so the words around them stay separate.
corpus <- tm_map(corpus, toSpace, "/")
corpus <- tm_map(corpus, toSpace, "-")
corpus <- tm_map(corpus, toSpace, ":")
corpus <- tm_map(corpus, toSpace, ";")
corpus <- tm_map(corpus, toSpace, "#")
corpus <- tm_map(corpus, toSpace, "\\(")
corpus <- tm_map(corpus, toSpace, ")")
corpus <- tm_map(corpus, toSpace, ",")
corpus <- tm_map(corpus, toSpace, "_")
corpus <- tm_map(corpus, content_transformer(removeSpecialChars))

# The text is already lower case here, so no second tolower pass is needed.
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stemDocument)

# One row per document, one column per term.
tdm <- DocumentTermMatrix(corpus)
train1 <- as.matrix(tdm)

# Keep only the non-text columns that take more than one distinct value.
complete1 <- subset(complete, select = c(Group, Type, Target))
complete1 <- Filter(function(x) length(unique(x)) > 1, complete1)
train <- cbind(complete1, train1)
# complete1 was selected without Text, so this is only a safeguard.
train$Text <- NULL
train$Target <- as.factor(train$Target)
############################################################################################
# Model Run
############################################################################################
fit <- svm(Target ~ ., data = train)
# Save the model with the term dictionary and the input data.
termlist <- list(dictionary = Terms(tdm))
retval <- list(model = fit, termlist = termlist, complete = complete)
saveRDS(retval, "./modelTarget.rds")
We now expect to receive data in other languages as well: Chinese, Korean, Japanese, French, Portuguese and Spanish.
I wanted to check whether R supports these kinds of data, especially for text cleaning.
Please advise.
Related
I need to divide this data frame of 1,038,319 rows into smaller tables of 25k rows each and then perform the following operations on each table. At first I was doing this manually, table by table, as shown for the first two tables in the following code:
# Chunk 1: rows 1 to 25000 of `comments`.
comments1 <- comments[1:25000, ]
texts1 <- comments1$message
corpus1 <- Corpus(VectorSource(texts1))

# Transformer that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Apply toSpace for "-", "http" and ":" one after the other, in that order.
corpus1 <- Reduce(
  function(crp, pattern) tm_map(crp, toSpace, pattern),
  c("-", "http", ":"),
  corpus1
)

# Standard clean-up steps.
corpus1 <- tm_map(corpus1, content_transformer(tolower))
corpus1 <- tm_map(corpus1, removeNumbers)
corpus1 <- tm_map(corpus1, removeWords, stopwords("english"))
corpus1 <- tm_map(corpus1, removePunctuation)
corpus1 <- tm_map(corpus1, stripWhitespace)

# Total count of each term across the chunk.
dtm1 <- DocumentTermMatrix(corpus1)
freq1 <- colSums(as.matrix(dtm1))

# Print the most frequent terms.
ord1 <- order(freq1, decreasing = TRUE)
freq1[head(ord1)]

# Word-frequency table for this chunk.
wf1 <- data.frame(word = names(freq1), freq = freq1)
#-----------------
# Chunk 2: rows 25001 to 50000 of `comments`.
comments2 <- comments[25001:50000, ]
texts2 <- comments2$message
corpus2 <- Corpus(VectorSource(texts2))

# Transformer that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Apply toSpace for "-", "http" and ":" one after the other, in that order.
corpus2 <- Reduce(
  function(crp, pattern) tm_map(crp, toSpace, pattern),
  c("-", "http", ":"),
  corpus2
)

# Standard clean-up steps.
corpus2 <- tm_map(corpus2, content_transformer(tolower))
corpus2 <- tm_map(corpus2, removeNumbers)
corpus2 <- tm_map(corpus2, removeWords, stopwords("english"))
corpus2 <- tm_map(corpus2, removePunctuation)
corpus2 <- tm_map(corpus2, stripWhitespace)

# Total count of each term across the chunk.
dtm2 <- DocumentTermMatrix(corpus2)
freq2 <- colSums(as.matrix(dtm2))

# Print the most frequent terms.
ord2 <- order(freq2, decreasing = TRUE)
freq2[head(ord2)]

# Word-frequency table for this chunk.
wf2 <- data.frame(word = names(freq2), freq = freq2)
The code above is an example of the operation I need to perform; here I do it for the first two tables, each covering a range of 25k rows. Is there a way to do this with a "for" loop?
I would then also like to combine (sum) all the resulting WF tables into one.
One way to do this is to define a function that processes the text and then pass each group of comments to that function.
See the comments in the code below for step-by-step instructions:
#define the function
# Build a word-frequency table for one group of comments.
#
# comment: a data frame of comments whose text is in its `message` column
#          (for example one element of the list produced by split()).
# Returns a data frame with columns `word` and `freq`, one row per term.
processtext <- function(comment) {
  # Read the text from the group that was passed in, not from a global object;
  # otherwise every group would give the same result.
  texts <- comment$message
  corpus <- Corpus(VectorSource(texts))

  # Replace every match of `pattern` with a single space.
  toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
  corpus <- tm_map(corpus, toSpace, "-")
  corpus <- tm_map(corpus, toSpace, "http")
  corpus <- tm_map(corpus, toSpace, ":")

  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)

  # Total count of each term across the documents of this group.
  dtm <- DocumentTermMatrix(corpus)
  freq <- colSums(as.matrix(dtm))

  data.frame(word = names(freq), freq = freq)
}
# number of lines per group
n <- 100
# determine the number of groups; the last group may hold fewer than n rows,
# and no empty extra group is created when the row count is a multiple of n
numberofgroups <- ceiling(nrow(comments) / n)
# split into a list of groups: rows 1..n go to group 1, rows n+1..2n to group 2, ...
# length.out keeps the grouping vector exactly as long as the data
listofcomments <- split(
  comments,
  rep(seq_len(numberofgroups), each = n, length.out = nrow(comments))
)
# process the list of groups
# returns a list of results, one per group
answer <- lapply(listofcomments, processtext)
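Now the results for all of the groups are stored in one list, which can be processed further as a list, or individually with answer[[1]]. To sum the word frequencies of all groups into one table, you can use, for example, aggregate(freq ~ word, data = do.call(rbind, answer), FUN = sum).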
My code so far is as follows:
# Text-cleaning pipeline for the `comment` column of final_data, followed by
# construction of a document-term matrix.
corpus <- VCorpus(VectorSource(final_data$comment))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords())
corpus <- tm_map(corpus, stemDocument)
# Remove the two words 'brw' and 'cid'.
corpus <- tm_map(corpus, removeWords, 'brw')
corpus <- tm_map(corpus, removeWords, 'cid')
corpus <- tm_map(corpus, stripWhitespace)
# trimws is passed to tm_map() directly, without content_transformer().
corpus <- tm_map(corpus, trimws)
dtm <- DocumentTermMatrix(corpus)
I am getting the following error on the last command (DocumentTermMatrix):
'no applicable method for 'meta' applied to an object of class
"character"'
Can you please let me know how to fix it?
The issue is caused by this line of code: tm_map(corpus, trimws). Its result is a plain character string instead of a document, which breaks the corpus. If you want to use a function within tm_map that is not part of the tm package, you need to wrap it in content_transformer.
If you change the last lines of your code to the ones below, it should work.
# Wrap trimws in content_transformer() so that each element stays a tm
# document, and apply it to the existing corpus.
corpus <- tm_map(corpus, content_transformer(trimws))
dtm <- DocumentTermMatrix(corpus)
I have a code like this:
# test2.csv contains 79 rows (one per document) and one column of text
# holding the document content.
nf <- read.csv("test2.csv")
corpus <- Corpus(VectorSource(nf$segment))

# tolower is not a tm transformation, so it is wrapped in
# content_transformer() to keep the corpus structure intact.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# Extra words to drop in addition to the standard English stop words.
corpus <- tm_map(corpus, removeWords, c("shall", "will", "can", "could"))
corpus <- tm_map(corpus, stemDocument, language = "english")

# Terms in rows, documents in columns.
td.mat <- as.matrix(TermDocumentMatrix(corpus))
# Transpose so that distances are computed between documents.
dist.mat <- dist(t(td.mat))
ft <- hclust(dist.mat, method = "ward.D2")
plot(ft)
my dendrogram
I have a cluster dendrogram of the documents. If I cut it at height = 50, how can I get the terms at this level?
I am trying to create a corpus in which every two consecutive words in a document are combined; I do not want a corpus of single words.
I am using the script below. Is there a way to create the corpus "docs" so that it contains the pairs of consecutive words from each document? Please advise.
library(plyr)
library(tm)
library(e1071)
# Work from the assignment directory and load the raw data.
setwd("C:/Assignment/Assignment-Group-Prediction/IPM")
training <- read.csv("Data.csv", header = TRUE, na.strings = c(""))
Res_Desc_Train <- subset(training, select = c("Group", "Description"))

## Step 1: build the corpus from the Description column
docs <- Corpus(VectorSource(Res_Desc_Train$Description))
docs <- tm_map(docs, content_transformer(tolower))

# Helpers for potentially problematic symbols:
#   toSpace            - replaces every match of `pattern` with a space
#   removeSpecialChars - deletes every character outside [a-zA-Z0-9 ]
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
removeSpecialChars <- function(x) gsub("[^a-zA-Z0-9 ]", "", x)

# Replace each separator with a space, one pattern after the other.
docs <- Reduce(
  function(crp, pattern) tm_map(crp, toSpace, pattern),
  c("/", "-", ":", ";", "#", "\\(", ")", ",", "_"),
  docs
)
docs <- tm_map(docs, content_transformer(removeSpecialChars))

# Remaining clean-up steps.
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeWords, stopwords("en"))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, removeNumbers)
The FAQ of the tm package answers your question directly:
Can I use bigrams instead of single tokens in a term-document matrix?
Yes. Package NLP provides functionality to compute n-grams which can be used to construct a corresponding tokenizer. E.g.:
library("tm")
data("crude")
# Tokenizer that returns every pair of consecutive words in `x` as a single
# space-separated string.
BigramTokenizer <- function(x) {
  word_pairs <- ngrams(words(x), 2)
  joined <- lapply(word_pairs, paste, collapse = " ")
  unlist(joined, use.names = FALSE)
}

# Term-document matrix built with the bigram tokenizer.
tdm <- TermDocumentMatrix(crude, control = list(tokenize = BigramTokenizer))
# Inspect the first ten documents after removing sparse terms.
inspect(removeSparseTerms(tdm[, 1:10], 0.7))
I have a large corpus of Russian text. When I build a wordcloud, I see some characters like 'ч' are not rendered. The code looks like this:
# Read the articles, build a term-frequency table and draw a word cloud
# into wordcloud.png.
dat <- read.csv("news.csv", sep = ";", header = TRUE, stringsAsFactors = FALSE)
corpus <- Corpus(VectorSource(dat$Article),
                 readerControl = list(reader = readPlain, language = "ru"))
corpus <- tm_map(corpus, removePunctuation)
# tolower is not a tm transformation, so it is wrapped in
# content_transformer() to keep the corpus structure intact.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
# The original call had one closing parenthesis too many.
corpus <- tm_map(corpus, removeWords, stopwords("russian"))

# Terms in rows; row sums give the total frequency of each term.
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)

pal2 <- brewer.pal(8, "Dark2")
png("wordcloud.png", width = 640, height = 640)
wordcloud(d$word, d$freq, scale = c(8, .2), min.freq = 5, max.words = 200,
          random.order = FALSE, rot.per = 0, colors = pal2)
dev.off()
EDIT
Oh, I did it myself. I just added one line of code to do the trick:
# Convert the text from cp1251 to UTF-8; iconv is not a tm transformation,
# so it is wrapped in content_transformer().
corpus <- tm_map(corpus, content_transformer(iconv), from = "cp1251", to = "UTF-8")
[From the OP's own edit, repeated here as an answer to complete the question-answer pair.]
You need to add the following line alongside the other tm_map() calls:
# Convert the text from cp1251 to UTF-8; iconv is not a tm transformation,
# so it is wrapped in content_transformer().
corpus <- tm_map(corpus, content_transformer(iconv), from = "cp1251", to = "UTF-8")