Stemming words in r does not work as expected - r

I am trying to do a very simple word steming in R and getting something very unexpected. In the code below 'complete' variable is 'NA'. Why can't I complete stem on the word easy?
library(tm)
library(SnowballC)
dict <- c("easy")
stem <- stemDocument(dict, language = "english")
complete <- stemCompletion(stem, dictionary=dict)
Thank You!

You can see the internals of the stemCompletion() function with tm:::stemCompletion.
function (x, dictionary, type = c("prevalent", "first", "longest", "none", "random", "shortest")){
if(inherits(dictionary, "Corpus"))
dictionary <- unique(unlist(lapply(dictionary, words)))
type <- match.arg(type)
possibleCompletions <- lapply(x, function(w) grep(sprintf("^%s",w), dictionary, value = TRUE))
switch(type, first = {
setNames(sapply(possibleCompletions, "[", 1), x)
}, longest = {
ordering <- lapply(possibleCompletions, function(x) order(nchar(x),
decreasing = TRUE))
possibleCompletions <- mapply(function(x, id) x[id],
possibleCompletions, ordering, SIMPLIFY = FALSE)
setNames(sapply(possibleCompletions, "[", 1), x)
}, none = {
setNames(x, x)
}, prevalent = {
possibleCompletions <- lapply(possibleCompletions, function(x) sort(table(x),
decreasing = TRUE))
n <- names(sapply(possibleCompletions, "[", 1))
setNames(if (length(n)) n else rep(NA, length(x)), x)
}, random = {
setNames(sapply(possibleCompletions, function(x) {
if (length(x)) sample(x, 1) else NA
}), x)
}, shortest = {
ordering <- lapply(possibleCompletions, function(x) order(nchar(x)))
possibleCompletions <- mapply(function(x, id) x[id],
possibleCompletions, ordering, SIMPLIFY = FALSE)
setNames(sapply(possibleCompletions, "[", 1), x)
})
}
The x argument is your stemmed terms, dictionary is the unstemmed. The only line that matters is the fifth; it does a simple regex match for the stemmed word in the list of dictionary terms.
possibleCompletions <- lapply(x, function(w) grep(sprintf("^%s",w), dictionary, value = TRUE))
Therefore it fails, since it can't find a match for "easi" with "easy". If you also have the word "easiest" in your dictionary, then both terms match, since there is now a dictionary word with the same beginning four letters to match to.
library(tm)
library(SnowballC)
dict <- c("easy","easiest")
stem <- stemDocument(dict, language = "english")
complete <- stemCompletion(stem, dictionary=dict)
complete
easi easiest
"easiest" "easiest"

wordStem() seems to do it..
library(tm)
library(SnowballC)
dict <- c("easy")
> wordStem(dict)
[1] "easi"

Related

Why can I not print out dataframe from Excel in R

Trying to print out dataframe that is created after importing Excel file into R using following code:
library("readxl")
data <- read_excel("grad programs.xlsx", sheet="Sheet2")
print(data)
But instead of getting the Excel file, I get this really long random message:
print(data)
function (..., list = character(), package = NULL, lib.loc = NULL,
verbose = getOption("verbose"), envir = .GlobalEnv, overwrite = TRUE)
{
fileExt <- function(x) {
db <- grepl("\\.[^.]+\\.(gz|bz2|xz)$", x)
ans <- sub(".*\\.", "", x)
ans[db] <- sub(".*\\.([^.]+\\.)(gz|bz2|xz)$", "\\1\\2",
x[db])
ans
}
my_read_table <- function(...) {
lcc <- Sys.getlocale("LC_COLLATE")
on.exit(Sys.setlocale("LC_COLLATE", lcc))
Sys.setlocale("LC_COLLATE", "C")
read.table(...)
}
stopifnot(is.character(list))
names <- c(as.character(substitute(list(...))[-1L]), list)
if (!is.null(package)) {
if (!is.character(package))
stop("'package' must be a character vector or NULL")
}
paths <- find.package(package, lib.loc, verbose = verbose)
if (is.null(lib.loc))
paths <- c(path.package(package, TRUE), if (!length(package)) getwd(),
paths)
paths <- unique(normalizePath(paths[file.exists(paths)]))
paths <- paths[dir.exists(file.path(paths, "data"))]
dataExts <- tools:::.make_file_exts("data")
if (length(names) == 0L) {
db <- matrix(character(), nrow = 0L, ncol = 4L)
for (path in paths) {
entries <- NULL
packageName <- if (file_test("-f", file.path(path,
"DESCRIPTION")))
basename(path)
else "."
Message is longer than that, but that's the start - any idea why get this message rather than the actual data in the Excel sheet

retrieve a string from grep function in R

I'm searching in some sentences some words using this code
sent <- c()
for (w in my_words){
my_selected_sentences <- grep(paste0('\\b', w, '\\b'), my_sentences, ignore.case = TRUE, value = TRUE, perl = TRUE)
sent <- c(sent, my_selected_sentences)
}
Now I'd like to add in the vector "sent" word or words that have been retrieved. For example
[1] here is the selected sentence.---"the", "sentence"
How I can do? thanks!
You can do something like this (used an artificial example):
my_sentences <- c('hi there','will you do it', 'i will go home', 'salt is good for health')
my_sentences_df <- data.frame(sentences = my_sentences, words_retrieved = "", stringsAsFactors = FALSE)
my_words <- c('will','salt','home')
sent <- c()
for (w in my_words){
my_selected_sentences <- grep(paste0('\\b', w, '\\b'), my_sentences, ignore.case = TRUE, value = TRUE, perl = TRUE)
my_sentences_df[grep(w, my_sentences_df$sentences),"words_retrieved"] <-
paste(my_sentences_df[grep(w, my_sentences_df$sentences),"words_retrieved"], w, sep = ", ")
sent <- c(sent, my_selected_sentences)
}
my_sentences_df$words_retrieved <- sub('.', '', my_sentences_df$words_retrieved)
writeLines(with(my_sentences_df, paste0(my_sentences, " ---", words_retrieved)))

How to replace the string with special characters in R in a loop?

I am trying to replace the strings with special characters using gsub. But I am running into error invalid regular expression '\bc++\b', reason 'Invalid use of repetition operators'.
df = data.frame("word"=c('c++', '.XLS','Java-prog'))
for i in nrow(df){
df$new[i] <- gsub(paste0("\\b", df$word[i], "\\b"), "xx", df$new[i],ignore.case = T)
}
Actual code:
data = data.frame("word"=c('python', 'java'),
"description"=c('Java-script is a statically typed and Python py is a dynamically typed',
'java is a programming language'), stringsAsFactors = FALSE)
ll <- as.list(data$word)
data$new <- data$description
for(i in seq_len(nrow(data))) for(j in seq_along(ll)) {
data$new[i] <- gsub(paste0("\\b", ll[j], "\\b"), "url", data$new[i],ignore.case = T)
}
The expectation is to replace the values with xx.

Error at creating word cloud in R (Error in simple_triplet_matrix: 'i, j, v' different lengths)

I have the following code in R to get the recent tweets about the local mayor candidates and create a wordcloud:
library(twitteR)
library(ROAuth)
require(RCurl)
library(stringr)
library(tm)
library(ggmap)
library(plyr)
library(dplyr)
library(SnowballC)
library(wordcloud)
(...)
setup_twitter_oauth(...)
N = 10000 #Number of twetts
S = 200 #200Km radius from Natal (Covers the whole Natal area)
candidate = 'Carlos+Eduardo'
#Lists so I can add more cities in future codes
lats = c(-5.7792569)
lons = c(-35.200916)
# Gets the tweets from every city
result = do.call(
rbind,
lapply(
1:length(lats),
function(i) searchTwitter(
candidate,
lang="pt-br",
n=N,
resultType="recent",
geocode=paste(lats[i], lons[i], paste0(S,"km"), sep=",")
)
)
)
# Get the latitude and longitude of each tweet,
# the tweet itself, how many times it was re-twitted and favorited,
# the date and time it was twitted, etc and builds a data frame.
result_lat = sapply(result, function(x) as.numeric(x$getLatitude()))
result_lat = sapply(result_lat, function(z) ifelse(length(z) != 0, z, NA))
result_lon = sapply(result, function(x) as.numeric(x$getLongitude()))
result_lon = sapply(result_lon, function(z) ifelse(length(z) != 0, z, NA))
result_date = lapply(result, function(x) x$getCreated())
result_date = sapply(result_date,
function(x) strftime(x, format="%d/%m/%Y %H:%M%S", tz="UTC")
)
result_text = sapply(result, function(x) x$getText())
result_text = unlist(result_text)
is_retweet = sapply(result, function(x) x$getIsRetweet())
retweeted = sapply(result, function(x) x$getRetweeted())
retweet_count = sapply(result, function(x) x$getRetweetCount())
favorite_count = sapply(result, function(x) x$getFavoriteCount())
favorited = sapply(result, function(x) x$getFavorited())
tweets = data.frame(
cbind(
tweet = result_text,
date = result_date,
lat = result_lat,
lon = result_lon,
is_retweet=is_retweet,
retweeted = retweeted,
retweet_count = retweet_count,
favorite_count = favorite_count,
favorited = favorited
)
)
# World Cloud
#Text stemming require the package ‘SnowballC’.
#https://cran.r-project.org/web/packages/SnowballC/index.html
#Create corpus
corpus = Corpus(VectorSource(tweets$tweet))
corpus = tm_map(corpus, removePunctuation)
corpus = tm_map(corpus, removeWords, stopwords('portuguese'))
corpus = tm_map(corpus, stemDocument)
wordcloud(corpus, max.words = 50, random.order = FALSE)
But I'm getting these errors:
Error in simple_triplet_matrix(i = i, j = j, v = as.numeric(v), nrow =
length(allTerms), :
'i, j, v' different lengths
In addition: Warning messages:
1: In doRppAPICall("search/tweets", n, params = params,
retryOnRateLimit = retryOnRateLimit, :
10000 tweets were requested but the API can only return 518
#I understant this one, I cannot get more tweets that exists
2: In mclapply(unname(content(x)), termFreq, control) : all
scheduled cores encountered errors in user code
3: In simple_triplet_matrix(i = i, j = j, v = as.numeric(v), nrow =
length(allTerms), : NAs introduced by coercion
It's my first time building a wordcloud and I followed tutorials like this one.
It's there a way to fix it? Another things is: the class of tweets$tweet is "factor", should I convert it or something? If yes, how I do that?
I think the problem is that wordcloud is not defined for tm corpus objects. Install the quanteda package, and try this:
plot(quanteda::corpus(corpus), max.words = 50, random.order = FALSE)
I followed this tutorial where it's defined a function to "clean" the text and also creating a TermDocumentMatrix instead of a stemDocument before building the wordcloud. It's working properly now.

How to parse INI like configuration files with R?

Is there an R function for parsing INI like configuration files?
While searching I only found this discussion.
Here is an answer that was given to exact the same question on r-help in 2007 (thanks to #Spacedman for pointing this out):
Parse.INI <- function(INI.filename)
{
connection <- file(INI.filename)
Lines <- readLines(connection)
close(connection)
Lines <- chartr("[]", "==", Lines) # change section headers
connection <- textConnection(Lines)
d <- read.table(connection, as.is = TRUE, sep = "=", fill = TRUE)
close(connection)
L <- d$V1 == "" # location of section breaks
d <- subset(transform(d, V3 = V2[which(L)[cumsum(L)]])[1:3],
V1 != "")
ToParse <- paste("INI.list$", d$V3, "$", d$V1, " <- '",
d$V2, "'", sep="")
INI.list <- list()
eval(parse(text=ToParse))
return(INI.list)
}
Actually, I wrote a short and presumably buggy function (i.e. not covering all corner cases) which works for me now:
read.ini <- function(x) {
if(length(x)==1 && !any(grepl("\\n", x))) lines <- readLines(x) else lines <- x
lines <- strsplit(lines, "\n", fixed=TRUE)[[1]]
lines <- lines[!grepl("^;", lines) & nchar(lines) >= 2] # strip comments & blank lines
lines <- gsub("\\r$", "", lines)
idx <- which(grepl("^\\[.+\\]$", lines))
if(idx[[1]] != 1) stop("invalid INI file. Must start with a section.")
res <- list()
fun <- function(from, to) {
tups <- strsplit(lines[(from+1):(to-1)], "[ ]*=[ ]*")
for (i in 1:length(tups))
if(length(tups[[i]])>2) tups[[i]] <- c(tups[[i]][[1]], gsub("\\=", "=", paste(tail(tups[[i]],-1), collapse="=")))
tups <- unlist(tups)
keys <- strcap(tups[seq(from=1, by=2, length.out=length(tups)/2)])
vals <- tups[seq(from=2, by=2, length.out=length(tups)/2)]
sec <- strcap(substring(lines[[from]], 2, nchar(lines[[from]])-1))
res[[sec]] <<- setNames(vals, keys)
}
mapply(fun, idx, c(tail(idx, -1), length(lines)+1))
return(res)
}
where strcap is a helper function that capitalizes a string:
strcap <- function(s) paste(toupper(substr(s,1,1)), tolower(substring(s,2)), sep="")
There are also some C solutions for this, like inih or libini that might be useful. I did not try them out, though.

Resources