I have created a corpus and processed it using the tm package; a snippet is below:
cleanCorpus <- function(corpus) {
  corpus.tmp <- tm_map(corpus, content_transformer(tolower))
  corpus.tmp <- tm_map(corpus.tmp, removePunctuation)
  corpus.tmp <- tm_map(corpus.tmp, removeNumbers)
  corpus.tmp <- tm_map(corpus.tmp, removeWords, stopwords("english"))
  corpus.tmp <- tm_map(corpus.tmp, stemDocument)
  corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)
  return(corpus.tmp)
}
myCorpus <- Corpus(VectorSource(Data$body), readerControl = list(reader = readPlain))
cln.corpus <- cleanCorpus(myCorpus)
Now I am using the MPQA lexicon to get the total number of positive and negative words in each document of the corpus. So I have the lists:
pos.words <- lexicon$word[lexicon$Polarity == "positive"]
neg.words <- lexicon$word[lexicon$Polarity == "negative"]
How should I go about comparing the content of each document with the positive and negative lists to get both counts per document? I checked other posts on tm dictionaries, but it looks like that feature has been withdrawn.
For example:
library(tm)
data("crude")
myCorpus <- crude[1:2]
pos.words <- c("advantag", "easy", "cut")
neg.words <- c("problem", "weak", "uncertain")
weightSenti <- structure(function(m) {
  m$v <- rep(1, length(m$v))                    # binary: every occurring term counts as 1
  m$v[rownames(m)[m$i] %in% neg.words] <- -1    # flip the sign of entries for negative terms
  attr(m, "weighting") <- c("binarySenti", "binSenti")
  m
}, class = c("WeightFunction", "function"), name = "binarySenti", acronym = "binSenti")
tdm <- TermDocumentMatrix(myCorpus, control = list(weighting = weightSenti,
                                                   dictionary = c(pos.words, neg.words)))
colSums(as.matrix(tdm))
# 127 144
# 2 -2
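If you need the positive and negative counts separately per document rather than a single net score, here is a minimal sketch without a custom weighting, assuming the cln.corpus and the pos.words/neg.words vectors from above (note that because cleanCorpus() stems the documents, the lexicon entries may need to be stemmed as well to match):
# restrict the TDM to the lexicon terms, then sum by polarity per document
tdm <- TermDocumentMatrix(cln.corpus,
                          control = list(dictionary = c(pos.words, neg.words)))
m <- as.matrix(tdm)
pos.counts <- colSums(m[rownames(m) %in% pos.words, , drop = FALSE])
neg.counts <- colSums(m[rownames(m) %in% neg.words, , drop = FALSE])
data.frame(doc = colnames(m), positive = pos.counts, negative = neg.counts)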
I need to divide this dataframe of 1,038,319 rows into smaller tables of 25k rows each, and then perform the following operation on each table. At first I was doing it manually, table by table, as with the first two tables in the code below:
comments1 <- comments[1:25000, ]
texts1 <- comments1$message
corpus1 <- Corpus(VectorSource(texts1))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus1 <- tm_map(corpus1, toSpace, "-")
corpus1 <- tm_map(corpus1, toSpace, "http")
corpus1 <- tm_map(corpus1, toSpace, ":")
corpus1 <- tm_map(corpus1, content_transformer(tolower))
corpus1 <- tm_map(corpus1, removeNumbers)
corpus1 <- tm_map(corpus1, removeWords, stopwords("english"))
corpus1 <- tm_map(corpus1, removePunctuation)
corpus1 <- tm_map(corpus1, stripWhitespace)
dtm1 <- DocumentTermMatrix(corpus1)
freq1 <- colSums(as.matrix(dtm1))
ord1 <- order(freq1, decreasing = TRUE)
freq1[head(ord1)]
wf1 <- data.frame(word = names(freq1), freq = freq1)
#-----------------
comments2 <- comments[25001:50000, ]
texts2 <- comments2$message
corpus2 <- Corpus(VectorSource(texts2))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus2 <- tm_map(corpus2, toSpace, "-")
corpus2 <- tm_map(corpus2, toSpace, "http")
corpus2 <- tm_map(corpus2, toSpace, ":")
corpus2 <- tm_map(corpus2, content_transformer(tolower))
corpus2 <- tm_map(corpus2, removeNumbers)
corpus2 <- tm_map(corpus2, removeWords, stopwords("english"))
corpus2 <- tm_map(corpus2, removePunctuation)
corpus2 <- tm_map(corpus2, stripWhitespace)
dtm2 <- DocumentTermMatrix(corpus2)
freq2 <- colSums(as.matrix(dtm2))
ord2 <- order(freq2, decreasing = TRUE)
freq2[head(ord2)]
wf2 <- data.frame(word = names(freq2), freq = freq2)
The code above is part of the operation I need to perform for each table; here I have done it for the first two tables, in ranges of 25k rows. Is there a way to do this with a for loop? Then I would also like to combine all the resulting wf tables into one.
One way to do this is to define a function to process the text and then pass each group of comments to that function for processing. See the comments below for step-by-step instructions:
# define the function
processtext <- function(comment) {
  texts <- comment$message   # the chunk passed in as the argument
  corpus <- Corpus(VectorSource(texts))
  toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
  corpus <- tm_map(corpus, toSpace, "-")
  corpus <- tm_map(corpus, toSpace, "http")
  corpus <- tm_map(corpus, toSpace, ":")
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  dtm <- DocumentTermMatrix(corpus)
  freq <- colSums(as.matrix(dtm))
  wf <- data.frame(word = names(freq), freq = freq)
  return(wf)   # explicitly return the word-frequency table
}
# number of rows per group
n <- 100
# determine the number of groups
numberofgroups <- length(comments$message) %/% n + 1
# split into a list of groups; length.out keeps the group labels
# from running past the last row
listofcomments <- split(comments, rep(1:numberofgroups, each = n, length.out = nrow(comments)))
# process the list of groups; returns a list of word-frequency tables
answer <- lapply(listofcomments, processtext)
Now the result for each group is stored in one list, which can be processed further as a list or accessed individually with answer[[1]].
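To combine all the resulting wf tables into one overall frequency table, one option is to stack the data frames and sum the frequencies per word. A minimal sketch in base R, assuming the answer list from above:
# stack the per-group tables and sum frequencies for words that appear in several groups
combined <- do.call(rbind, answer)
total.wf <- aggregate(freq ~ word, data = combined, FUN = sum)
total.wf <- total.wf[order(total.wf$freq, decreasing = TRUE), ]
head(total.wf)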
We have prepared machine learning algorithms, such as a classification algorithm with features as factors, and topic modelling on text data, where the text is in English.
Below is the script that has been prepared:
complete <- subset(complete, select = c(Group, Type, Text, Target))
data <- complete$Text
corpus <- Corpus(VectorSource(data))   # build the corpus from the Text column
corpus <- tm_map(corpus, content_transformer(tolower))
toSpace <- content_transformer(function(x, pattern) { return (gsub(pattern, " ", x))})
removeSpecialChars <- function(x) gsub("[^a-zA-Z0-9 ]","",x)
corpus <- tm_map(corpus, toSpace, "/")
corpus <- tm_map(corpus, toSpace, "-")
corpus <- tm_map(corpus, toSpace, ":")
corpus <- tm_map(corpus, toSpace, ";")
corpus <- tm_map(corpus, toSpace, "#")
corpus <- tm_map(corpus, toSpace, "\\(" )
corpus <- tm_map(corpus, toSpace, ")")
corpus <- tm_map(corpus, toSpace, ",")
corpus <- tm_map(corpus, toSpace, "_")
corpus <- tm_map(corpus, content_transformer(removeSpecialChars))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus,stemDocument)
tdm <- DocumentTermMatrix(corpus)
train1 <- as.matrix(tdm)
complete1 <- subset(complete,select=c(Group,Type,Target))
complete1 <- Filter(function(x)(length(unique(x))>1), complete1)
train <- cbind(complete1, train1)
train$Text <- NULL
train$Target <- as.factor(train$Target)
############################################################################################
# Model Run
############################################################################################
fit <- svm(Target ~ ., data = train)   # svm() from the e1071 package
termlist <- list(dictionary = Terms(tdm))
retval <- list(model = fit, termlist = termlist, complete = complete)
saveRDS(retval, "./modelTarget.rds")
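The saved termlist is presumably intended for scoring new data later; a hedged sketch of how it could be used, where newText is a hypothetical character vector that should first go through the same cleaning steps as the training corpus above:
retval    <- readRDS("./modelTarget.rds")
newCorpus <- Corpus(VectorSource(newText))   # newText is an assumed input, cleaned as above
# restricting the DTM to the training vocabulary keeps the columns aligned with the model
newDtm <- DocumentTermMatrix(newCorpus,
                             control = list(dictionary = retval$termlist$dictionary))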
Now we will be expecting data in other languages: Chinese, Korean, Japanese, French, Portuguese, and Spanish. I wanted to check whether R supports these kinds of data, especially for text cleaning. Please advise.
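On the text-cleaning side, for what it's worth: tm's stopwords() is backed by the Snowball stopword lists, which do cover French, Spanish, and Portuguese, but no built-in lists ship for Chinese, Japanese, or Korean, and CJK text is not whitespace-delimited, so it needs a word-segmentation step before tm-style cleaning applies. A quick check (a sketch, not a full pipeline):
library(tm)
head(stopwords("french"))
head(stopwords("spanish"))
head(stopwords("portuguese"))
# no stopwords("chinese") / ("japanese") / ("korean") ship with tm;
# CJK input must be segmented into words by a separate tool first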
Here is the code I use to create bi-grams with a frequency list:
library(tm)
library(RWeka)
#data <- myData[,2]
tdm.generate <- function(string, ng){
# tutorial on rweka - http://tm.r-forge.r-project.org/faq.html
corpus <- Corpus(VectorSource(string)) # create corpus for TM processing
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
# corpus <- tm_map(corpus, removeWords, stopwords("english"))
options(mc.cores=1) # http://stackoverflow.com/questions/17703553/bigrams-instead-of-single-words-in-termdocument-matrix-using-r-and-rweka/20251039#20251039
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = ng, max = ng)) # create n-grams
tdm <- TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer)) # create tdm from n-grams
tdm
}
source("GenerateTDM.R") # generatetdm function in appendix
tdm <- tdm.generate("The book The book The greatest The book",2)
tdm.matrix <- as.matrix(tdm)
topwords <- rowSums(tdm.matrix)
topwords <- as.numeric(topwords)
hist(topwords, breaks = 10)
tdm.matrix <- as.matrix(tdm)
topwords <- rowSums(tdm.matrix)
head(sort(topwords, decreasing = TRUE))
The result for the above code is:
the book greatest
4 3 1
Instead, I'm looking for the result where bi-grams are shown like:
"the book" "book the"
3 2
What needs to be changed in the above code to get the output as above?
You need to use VCorpus instead of Corpus. I was having the same issue; you can find more details here.
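Concretely, the only change needed in tdm.generate() above is the corpus constructor; with a VCorpus the custom tokenize control is honoured and the TDM rows become bi-grams. A sketch of the fixed function:
tdm.generate <- function(string, ng) {
  corpus <- VCorpus(VectorSource(string))   # VCorpus instead of Corpus
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = ng, max = ng))
  TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer))
}
tdm <- tdm.generate("The book The book The greatest The book", 2)
head(sort(rowSums(as.matrix(tdm)), decreasing = TRUE))
# should now count bi-grams, e.g. "the book" 3, "book the" 2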
I'm working on word clouds in R, and so far I've been successful with just the basic stuff. However, what I want to do now is show a word cloud for a specific location. E.g., if I have text like:
TEXT LOCATION
True or false? link(#Addition, #Classification) NewYork,USA
Gene deFuser: detecting gene fusion events from protein sequences #bmc #bioinformatics Norwich,UK
Biologists do have a sense of humor, especially computational bio people France
Semantic Inference using #Chemogenomics Data for Drug Discovery London,UK
Here is the basic word cloud code I'm using:
library(tm)
library(SnowballC)
library(wordcloud)
DATA<-c('True or false? link(#Addition, #Classification) ','Gene deFuser: detecting gene fusion events from protein sequences #bmc #bioinformatics',' Biologists do have a sense of humor, especially computational bio people','Semantic Inference using #Chemogenomics Data for Drug Discovery')
Location<-c('NewYork,USA','Norwich,UK',' France','London,UK')
jeopQ<-data.frame(DATA,Location)
jeopCorpus <- Corpus(VectorSource(jeopQ$DATA))
jeopCorpus <- tm_map(jeopCorpus, content_transformer(tolower))
jeopCorpus <- tm_map(jeopCorpus, removePunctuation)
jeopCorpus <- tm_map(jeopCorpus, PlainTextDocument)
jeopCorpus <- tm_map(jeopCorpus, removeNumbers)
jeopCorpus <- tm_map(jeopCorpus, removeWords, stopwords('english'))
jeopCorpus <- tm_map(jeopCorpus, stemDocument)
myDTM <- TermDocumentMatrix(jeopCorpus, control = list(minWordLength = 1))
m <- as.matrix(myDTM)
v <- sort(rowSums(m), decreasing = TRUE)
set.seed(4363)
wordcloud(names(v), v, max.words = 100, min.freq = 3, scale = c(4, 0.1),
          random.order = FALSE, rot.per = .5, vfont = c("sans serif", "plain"),
          colors = palette())
I want something like a separate word cloud for locations containing "USA", another for locations containing "UK", and a separate word cloud for France. Is this possible?
You can extract the country from the Location column and then loop over its unique values:
jeopQ <- data.frame(DATA, Location)
# Clean Location
jeopQ$Location <- sub('.*,\\s*','', jeopQ$Location)
# Loop
for (i in unique(jeopQ$Location)) {
  jeopCorpus <- Corpus(VectorSource(jeopQ$DATA[jeopQ$Location == i]))
  jeopCorpus <- tm_map(jeopCorpus, content_transformer(tolower))
  jeopCorpus <- tm_map(jeopCorpus, removePunctuation)
  jeopCorpus <- tm_map(jeopCorpus, PlainTextDocument)
  jeopCorpus <- tm_map(jeopCorpus, removeNumbers)
  jeopCorpus <- tm_map(jeopCorpus, removeWords, stopwords('english'))
  jeopCorpus <- tm_map(jeopCorpus, stemDocument)
  myDTM <- TermDocumentMatrix(jeopCorpus, control = list(minWordLength = 1))
  m <- as.matrix(myDTM)
  v <- sort(rowSums(m), decreasing = TRUE)
  set.seed(4363)
  wordcloud(names(v), v, max.words = 100, min.freq = 3, scale = c(4, 0.1),
            random.order = FALSE, rot.per = .5, vfont = c("sans serif", "plain"),
            colors = palette())
}
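If each location's cloud should go to its own image file rather than being drawn in succession on the same device, a small variation of the loop (a sketch; the file naming is illustrative):
for (i in unique(jeopQ$Location)) {
  jeopCorpus <- Corpus(VectorSource(jeopQ$DATA[jeopQ$Location == i]))
  jeopCorpus <- tm_map(jeopCorpus, content_transformer(tolower))
  jeopCorpus <- tm_map(jeopCorpus, removePunctuation)
  jeopCorpus <- tm_map(jeopCorpus, removeNumbers)
  jeopCorpus <- tm_map(jeopCorpus, removeWords, stopwords('english'))
  jeopCorpus <- tm_map(jeopCorpus, stemDocument)
  v <- sort(rowSums(as.matrix(TermDocumentMatrix(jeopCorpus))), decreasing = TRUE)
  png(paste0("wordcloud_", i, ".png"), width = 640, height = 640)
  wordcloud(names(v), v, max.words = 100, scale = c(4, 0.1), random.order = FALSE)
  dev.off()
}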
I have a large corpus of Russian text. When I build a word cloud, I see that some characters, like 'ч', are not rendered. The code looks like this:
library(tm)
library(wordcloud)
library(RColorBrewer)
dat <- read.csv("news.csv", sep = ";", header = TRUE, stringsAsFactors = FALSE)
corpus <- Corpus(VectorSource(dat$Article),
                 readerControl = list(reader = readPlain, language = "ru"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("russian"))
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
pal2 <- brewer.pal(8, "Dark2")
png("wordcloud.png", width = 640, height = 640)
wordcloud(d$word, d$freq, scale = c(8, .2), min.freq = 5, max.words = 200,
          random.order = FALSE, rot.per = 0, colors = pal2)
dev.off()
EDIT
Oh, I did it myself. I just added one line of code to do the trick:
corpus <- tm_map(corpus, iconv, 'cp1251', 'UTF-8')
[From the OP's own edit, repeated here to complete the question and answer.]
You need to add the following, along with the other tm_map() calls:
corpus <- tm_map(corpus, iconv, 'cp1251', 'UTF-8')
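The conversion needs to happen before the other transformations, so that tolower() and removeWords() see valid UTF-8. An alternative sketch that avoids tm_map() entirely is to convert the raw text before building the corpus:
# convert the raw articles from Windows-1251 to UTF-8 up front
dat$Article <- iconv(dat$Article, from = "cp1251", to = "UTF-8")
corpus <- Corpus(VectorSource(dat$Article),
                 readerControl = list(reader = readPlain, language = "ru"))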