How to remove non UTF-8 characters from text - r

I need help removing non UTF-8 character from my word cloud. So far this is my code. I've tried gsub and removeWords and they are still there in my word cloud and I do not know what to do to get rid of them. Any help would be appreciated. Thank you for your time.
txt <- readLines("11-0.txt")
corpus = VCorpus(VectorSource(txt))
gsub("’","‘","",txt)
corpus = tm_map(corpus, content_transformer(tolower))
corpus = tm_map(corpus, removeWords, stopwords("english"))
corpus = tm_map(corpus, removePunctuation)
corpus = tm_map(corpus, stripWhitespace)
corpus = tm_map(corpus, removeWords, c("gutenberg","gutenbergtm","â€","project"))
tdm = TermDocumentMatrix(corpus)
m = as.matrix(tdm)
v = sort(rowSums(m),decreasing = TRUE)
d = data.frame(word=names(v),freq=v)
wordcloud(d$word,d$freq,max.words = 20, random.order=FALSE, rot.per=0.2, colors=brewer.pal(8, "Dark2"))
Edit: Here is my inconv version
txt <- readLines("11-0.txt")
Encoding(txt) <- "latin1"
iconv(txt, "latin1", "ASCII", sub="")
corpus = VCorpus(VectorSource(txt))
corpus = tm_map(corpus, content_transformer(tolower))
corpus = tm_map(corpus, removeWords, stopwords("english"))
corpus = tm_map(corpus, removePunctuation)
corpus = tm_map(corpus, stripWhitespace)
corpus = tm_map(corpus, removeWords, c("gutenberg","gutenbergtm","project"))
tdm = TermDocumentMatrix(corpus)
m = as.matrix(tdm)
v = sort(rowSums(m),decreasing = TRUE)
d = data.frame(word=names(v),freq=v)
wordcloud(d$word,d$freq,max.words = 20, random.order=FALSE, rot.per=0.2, colors=brewer.pal(8, "Dark2"))
title(main="Alice in Wonderland word cloud",font.main=1,cex.main =1.5)

The signature of gsub is:
gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE)
Not sure what you wanted to do with
gsub("’","‘","",txt)
but that line is probably not doing what you want it to do...
See here for a previous SO question on gsub and non-ascii symbols.
Edit:
Suggested solution using iconv:
Removing all non-ASCII characters:
txt <- "’xxx‘"
iconv(txt, "latin1", "ASCII", sub="")
Returns:
[1] "xxx"

Related

Translate a wordcloud in R

I generated a wordcloud with the most frequent words uttered by a child acquiring Portuguese. I used the R package wordcloud. I would like to generate a translated version (English) of the wordcloud I created. Is that possible? Here is my code:
library(tm)
library(wordcloud)
mct <- read_file('mc.txt')
vs <- VectorSource(mct)
corpus <- Corpus(vs)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, c("pra", "né", "porque", "assim", "opa", stopwords('portuguese')))
corpus <- tm_map(corpus, stripWhitespace)
wordcloud(corpus,
min.freq = 1,
max.words = 60,
random.order = FALSE,
rot.per = 0.35,
colors=brewer.pal(8, "Dark2"))

Does R works for multilingual data

We have prepared machine learning algorithms like clasification algorithm having features as factors. Topic modelling on text data for which text data is in English
Below script which is prepared .
complete <- subset(complete,select=c(Group,Type,Text,Target))
data <- complete$Text
corpus <-tm_map(corpus,content_transformer(tolower))
toSpace <- content_transformer(function(x, pattern) { return (gsub(pattern, " ", x))})
removeSpecialChars <- function(x) gsub("[^a-zA-Z0-9 ]","",x)
corpus <- tm_map(corpus, toSpace, "/")
corpus <- tm_map(corpus, toSpace, "-")
corpus <- tm_map(corpus, toSpace, ":")
corpus <- tm_map(corpus, toSpace, ";")
corpus <- tm_map(corpus, toSpace, "#")
corpus <- tm_map(corpus, toSpace, "\\(" )
corpus <- tm_map(corpus, toSpace, ")")
corpus <- tm_map(corpus, toSpace, ",")
corpus <- tm_map(corpus, toSpace, "_")
corpus <- tm_map(corpus, content_transformer(removeSpecialChars))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus,stemDocument)
tdm <- DocumentTermMatrix(corpus)
train1 <- as.matrix(tdm)
complete1 <- subset(complete,select=c(Group,Type,Target))
complete1 <- Filter(function(x)(length(unique(x))>1), complete1)
train <- cbind(complete1, train1)
train$Text <- NULL
train$Target <- as.factor(train$Target)
############################################################################################
# Model Run
############################################################################################
fit <-svm(Target ~ ., data = train)
termlist <- list(dictionary = Terms(tdm))
retval <- list(model = fit, termlist = termlist, complete = complete)
saveRDS(retval, "./modelTarget.rds")
Now we will be expecting data in another languages - Chinese/Korean/Japanese/French/Portugese/Spanish .
Wanted to check if R support these types of data especially for text cleaning.
Please advice

plot word cloud errors

I was trying to plot a word cloud in R. My code is as below. The wordcloud can be plotted but the font size of all words are equally the same. It does not list the words with higher frequency in the front and show in bigger font sizes. Can anybody point what mistakes I made? Thank you!
install.packages('tm')
library(tm)
reviews <- read.csv('keywords_2.csv',stringsAsFactors = FALSE)
review_text <- paste(reviews$customer_search_term
, collapse =" ")
review_source<-VectorSource(review_text)
corpus<-Corpus(review_source)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
dtm <-DocumentTermMatrix(corpus)
dtm2<-as.matrix(dtm)
install.packages('wordcloud')
library(wordcloud)
frequency<- colSums(dtm2)
str(frequency)
words<-names(frequency)
wordcloud(words,scale=c(1,.0005), min.freq = 3,random.order = FALSE, rot.per = .15)

How detect foreign words in Corpus?

Suppose I am parsing an english corpus with the tm package, and I do the usual cleaning steps.
library(tm)
data("crude")
corpus <- Corpus(crude)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, content_transformer(removeWords)) stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, PlainTextDocument)
# text matrices
tdm <- TermDocumentMatrix(corpus)
dtm<- DocumentTermMatrix(corpus)
How do I identify the words written in a different language than the one of the corpus? A similar problem is faced with Python here, but my research did not produces interesting results.
This is not a complete solution, but I feel like it might help. I recently had to do something similar where I had to remove words from a corpus with Chinese characters. I ended up using a custom transformation with a regex to remove anything with a non a-z 0-9 character in it.
corpus <- tm_map(corpus, content_transformer(function(s){
gsub(pattern = '[^a-zA-Z0-9\\s]+',
x = s,
replacement = " ",
ignore.case = TRUE,
perl = TRUE)
}))
For example, if there is a Chinese word in there, it gets removed.
gsub(pattern = '[^a-zA-Z0-9\\s]+',
x = 'English 象形字 Chinese',
replacement = "",
ignore.case = TRUE,
perl = TRUE)
Output: "English Chinese"
It's trickier if you are trying to remove words from a language like Spanish because some letters have an accent while others don't. For example, this doesn't work completely, but maybe it's a start.
gsub(pattern = '[a-zA-Z0-9]+[^a-zA-Z0-9\\s]+[a-zA-Z0-9]+',
x = 'El jalapeño es caliente',
replacement = "",
ignore.case = TRUE,
perl = TRUE)
Output: "El es caliente"
Hope this helps!

Why some cyrillic letters are missing in wordcloud?

I have a large corpus of Russian text. When I build a wordcloud, I see some characters like 'ч' are not rendered. The code looks like this:
dat <- read.csv("news.csv",sep=";",header=TRUE,stringsAsFactors=FALSE)
corpus <- Corpus(VectorSource(dat$Article),
readerControl = list(reader=readPlain,language="ru"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords,
stopwords("russian")))
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
pal2 <- brewer.pal(8,"Dark2")
png("wordcloud.png", width=640,height=640)
wordcloud(d$word,d$freq, scale=c(8,.2), min.freq=5, max.words=200,
random.order=FALSE, rot.per=0, colors=pal2)
dev.off()
EDIT
Oh, I did it myself. I just added one line of code to do the trick:
corpus <- tm_map(corpus, iconv, 'cp1251', 'UTF-8')
[from OP's own edit, but repeated here as so to complete the Question-Answer]
You need to add, along with the other tm_map() calls.
corpus <- tm_map(corpus, iconv, 'cp1251', 'UTF-8')

Resources