I am new to topic modeling and was testing the lda.collapsed.gibbs.sampler() method by trying to "characterize" some 98 CVs. I first tried to do it using a corpus (as it is easier to do filtering etc.). However, this gave some unexpected results, probably because the lexicalize() function first converted the corpus to an object with only 3 documents/objects.
# method 1
a <-Corpus(DirSource(doc.folder,pattern = ".txt$"), readerControl = list(language="eng"))
a <- tm_map(a, content_transformer(removeNumbers))
a <- tm_map(a, content_transformer(removePunctuation))
a <- tm_map(a, content_transformer(stripWhitespace))
a <- tm_map(a, content_transformer(tolower))
lex <- lexicalize(a)
result = lda.collapsed.gibbs.sampler(lex$documents, 8,lex$vocab, 30, 0.1,0.1, initial = NULL, burnin = NULL, compute.log.likelihood = T)
length(a) # output: [1] 98
length(lex$documents) # output: [1] 3, even though I expect 98
dim(result$document_sums) # output: [1] 8 3 even though I expect 8 98
However, when I directly used the CV text as a character vector, it gave the expected results:
# method 2
filenames = list.files(path=doc.folder,pattern=".txt$",full.names = T)
df <- data.frame(stringsAsFactors=FALSE)
for (filename in filenames){
myfile = file(filename)
df <- rbind(df,cbind(name=file_path_sans_ext(basename(filename)),text=paste(readLines(myfile),collapse=" ")))
close(myfile)
}
# the following avoids an error due to French words etc. being used
df[,"text"] <- sapply(df[,"text"],iconv,"WINDOWS-1252","UTF-8")
lex <- lexicalize(df[,"text"])
result = lda.collapsed.gibbs.sampler(lex$documents, 8,lex$vocab, 30, 0.1,0.1, initial = NULL, burnin = NULL, compute.log.likelihood = T)
NROW(df) # output: [1] 98
length(lex$documents) # output: [1] 98 as expected
dim(result$document_sums) # output: [1] 8 98 as expected
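If the tm preprocessing from method 1 is worth keeping, a likely fix is to flatten the corpus into a plain character vector before calling lexicalize(), since lexicalize() expects raw text rather than a tm Corpus object. A minimal, untested sketch (assuming as.character() on a corpus element returns its text):
# one string per document, then lexicalize the character vector
texts <- sapply(a, function(doc) paste(as.character(doc), collapse = " "))
lex <- lexicalize(texts)
length(lex$documents) # expected: 98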
I am working with the R programming language.
I am trying to find the first time a certain pattern (e.g. ABCD) appears in a random string (e.g. ACABCDCDBCABCDBC, answer = 6). I wrote a function to do this:
library(stringr)
letters <- c("A", "B", "C", "D")
results <- list()
for (i in 1:100)
{
iteration_i = i
letters_i = paste(sample(letters, 100, replace=TRUE, prob=c(0.25, 0.25, 0.25, 0.25)),collapse="")
position_i = str_locate(letters_i, "ADBC")
results_tmp = data.frame(iteration_i , letters_i, position_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
The result looks something like this now (note: I don't think this is correct; in row 5, I see ABCD at the beginning of the row, but it's being recorded as NA for some reason):
iteration_i letters_i start end
1 1 BACDCCCDCCCDCDDBBCBBAACACBBBBAAABDDDACAABDDABBABADCDDCDACCBBBCABCDABCDCCCDADDDBADBDCADAABDBDCDCAACCB NA NA
2 2 CACACCCCDCCBADACBBAADBCABBAAAAADBDDBCADCAAADADAAABDCABBAABABBCBDADCDDDDCDBADDBDCBCDDDBDCDDAACBBBBACA 20 23
3 3 CDCBDAABDDDDADBAAABBADAADBDDDBDADDCABADDDCDABBBCBCBBACBBDADABBCDCCACDBCDCDDBDBADBCDCADDADDDBDBAAABBD 79 82
4 4 ADBCDBADADBAAACAADACACACACBDDCACBDACCBDAAABDBAAAABBCCDBADADDADCBCABCBAABDCBCDCDACDCCDBADCBDDAADBCDAC 1 4
5 5 DABCDDDCCBCDABADBBBBCDBCADCBBBDCAAACACCCBCBCADBDDABBACACBDABAAACCAAAAACCCCBCBCCABABDDADBABDDDCCDDCCC NA NA
6 6 DDDDDBDDDDBDDDABDDADAADCABCDAABBCCCDAABDDAACBDABBBBBABBCBDADBDCCAAADACCBCDDBDCAADCBBBCACDBBADDDDCABC NA NA
Currently, I am only generating 100 letters and hoping that this is enough to observe the desired pattern (sometimes this doesn't happen, notice the NA's) - is there a way to add a WHILE LOOP to what I have written to keep generating letters until the desired pattern first appears?
Can someone please show me how to do this?
Thanks!
The loop is a repeat loop, not while, and it only breaks when the pattern is found. I have set the results list length to 2; there's no point in making it bigger just to test the code.
library(stringr)
Letters <- c("A", "B", "C", "D")
Pattern <- "ADBC"
n <- 2L
set.seed(2022)
results <- vector("list", length = n)
for (i in seq.int(n)) {
repeat {
l <- sample(Letters, 100, replace = TRUE, prob=c(0.25, 0.25, 0.25, 0.25))
letters_i <- paste(l, collapse = "")
position_i <- str_locate(letters_i, pattern = Pattern)
if(any(!is.na(position_i))) break
}
results_tmp <- data.frame(iteration = i, letters = letters_i, position_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
results_df
#> iteration letters start end
#> 1 1 ADBDBDBBCABBBDDBADDAADCBBADACACDCCBBADAADCDDABADCABCDCDDCCCBDDAABACCBDAAAADBDDCCCCADBCBBDABBDCCCBADD 83 86
#> 2 2 DDBDBDBCDDBDBBBDBABBCCBBCCBDBDABBAAABACABADCCBBABADBCCCDABABBDBADCADCABDDDAAACCBDCAACACACBBDDDACCDDC 50 53
Created on 2022-06-11 by the reprex package (v2.0.1)
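If you really want a while loop rather than repeat, an equivalent formulation (a sketch reusing Letters and Pattern from the code above) keeps appending 100-letter chunks until the pattern shows up:
letters_i <- ""
while (is.na(str_locate(letters_i, Pattern)[1, "start"])) {
  letters_i <- paste0(letters_i, paste(sample(Letters, 100, replace = TRUE), collapse = ""))
}
position_i <- str_locate(letters_i, Pattern) # first match position
Because str_locate() is rerun on the full string each time, a match straddling two appended chunks is still found.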
I have a vector as follows:
playtimes <- c("1H18M20S", "1H27M5S", "18M27S", "56M38S", "21S")
and I want to convert these to playtimes in second. For example, the resulting vector would be something like this:
playtimeInSeconds <- c(4700, 5225, 1107, 3398, 21)
I'm having trouble with separating the strings correctly based on the H, M and S. I wrote the following, which works for the playtimes under 1 hour:
library(magrittr) # provides %>%
minutes <- gsub("M.*", "", playtimes)
seconds <- gsub(".*M", "", playtimes) %>%
  gsub("S", "", .)
totalPlaytime <- as.numeric(minutes)*60 + as.numeric(seconds)
But I'm not sure how to tackle the H portion of some strings.
You could strsplit on the unit letters and pad each result on the left to length 3 (so missing hour or minute parts become NA), which allows sapply to return a matrix to which you can apply the matrix product %*%.
# split on H/M/S, then right-align each piece vector to length 3 (hours NA if absent)
m <- sapply(strsplit(playtimes, 'H|M|S'), \(x) as.double(rev(`length<-`(rev(x), 3))))
# replace NA with 0, then matrix-multiply by seconds per unit (3600, 60, 1)
res <- as.vector(t(replace(m, is.na(m), 0)) %*% rbind(3600, 60, 1))
res
# [1] 4700 5225 1107 3398 21
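For clarity, this is the intermediate matrix m the one-liner builds for the example vector (values spelled out; rows are hours, minutes, seconds):
m
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    1    1   NA   NA   NA    <- hours
# [2,]   18   27   18   56   NA    <- minutes
# [3,]   20    5   27   38   21    <- seconds
# NAs become 0, and t(m) %*% rbind(3600, 60, 1) weights each row by the
# number of seconds per unit, giving 4700 5225 1107 3398 21.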
Interesting problem. Here is a solution; it could potentially be more efficient, but it does the job.
# function from https://www.statworx.com/de/blog/strsplit-but-keeping-the-delimiter/
strsplit <- function(x,
split,
type = "remove",
perl = FALSE,
...) {
if (type == "remove") {
# use base::strsplit
out <- base::strsplit(x = x, split = split, perl = perl, ...)
} else if (type == "before") {
# split before the delimiter and keep it
out <- base::strsplit(x = x,
split = paste0("(?<=.)(?=", split, ")"),
perl = TRUE,
...)
} else if (type == "after") {
# split after the delimiter and keep it
out <- base::strsplit(x = x,
split = paste0("(?<=", split, ")"),
perl = TRUE,
...)
} else {
# wrong type input
stop("type must be remove, after or before!")
}
return(out)
}
# convert to seconds
to_seconds <- c(H = 60 * 60,
M = 60,
S = 1)
get_seconds <- function(value, unit) {
value * to_seconds[unit]
}
# example vector
playtimes <- c("1H18M20S", "1H27M5S", "18M27S", "56M38S", "21S")
# extract time parts
times <- strsplit(playtimes,
split = "[A-Z]",
type = "after")
times
#> [[1]]
#> [1] "1H" "18M" "20S"
#>
#> [[2]]
#> [1] "1H" "27M" "5S"
#>
#> [[3]]
#> [1] "18M" "27S"
#>
#> [[4]]
#> [1] "56M" "38S"
#>
#> [[5]]
#> [1] "21S"
# calculate each time in seconds
sapply(times,
function(t) {
# split numeric and unit part
t_split <- strsplit(x = t,
split = "[A-Z]",
type = "before")
# calculate seconds for each unit part
times_in_seconds <- get_seconds(value = as.numeric(sapply(t_split, `[`, 1)),
unit = sapply(t_split, `[`, 2))
# sum of all parts
sum(times_in_seconds)
})
#> [1] 4700 5225 1107 3398 21
I followed the example given in the 3rd answer here and made the following:
library(stringr)   # str_count
library(lubridate) # hms, time_length
playtimes <- sapply(playtimes, function(x){paste(paste(rep(0, 3 - str_count(x, '[0-9]+')), collapse = ' '), x)})
totalPlaytime <- time_length(hms(playtimes))
Short, sweet, and it handles the cases where the playtime is less than 1 hr or less than 1 min.
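For reference, on the playtimes vector from the question this gives (assuming hms() parses the padded strings as intended):
totalPlaytime
#> [1] 4700 5225 1107 3398   21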
I'm trying to set up a way in R to print details of each assignment while R code is run. So, for example, if the code x <- 1 is run, then "x has been assigned 1" will automatically be printed.
Is this possible?
I have two thoughts on how this might be done but can't figure out if either is possible.
redefine the = primitive so that it also prints a message (a rough sketch of this idea follows the list)
have an assignment trigger another function to run
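As a proof of concept of the first idea: a function named <- defined in the global environment shadows the primitive, so assignments can be intercepted. A fragile sketch, for illustration only (it handles simple top-level assignments; replacement forms like df$col <- v will break it):
`<-` <- function(x, value) {
  # use base::assign so the shadowed operator is not called recursively
  base::assign(deparse(substitute(x)), value, envir = parent.frame())
  message(deparse(substitute(x)), " has been assigned ", deparse(substitute(value)))
  invisible(value)
}
x <- 1
#> x has been assigned 1
rm(`<-`) # restore normal assignment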
One possible solution, though it requires editing the code, would be:
# custom assignment function -----------------------------------------------------------------
`%<-%` <- function (lhs, rhs) {
cl <- match.call()
lhs <- substitute(lhs)
env <- parent.frame()
message("Info: `", lhs, "` defined as `", enquote(cl$rhs)[2], "`")
invisible(eval(assign(x = paste(lhs),
value = rhs,
envir = env))
)
}
# some tests ----------------------------------------------------------------------------------
ad %<-% c(1,2,33)
#> Info: `ad` defined as `c(1, 2, 33)`
ac %<-% 22
#> Info: `ac` defined as `22`
ad %<-% 22
#> Info: `ad` defined as `22`
df <- mtcars
df %<-% mtcars
#> Info: `df` defined as `mtcars`
If you don't want to modify files, you can define a modified source() function to replace the assignments with the newly defined %<-% function.
source_loudly <- function(filePath, ...) {
file_con <- file(filePath, open = "r")
txt <- readLines(file_con)
close(file_con)
txt_mod <- gsub(pattern = "<-", replace = "%<-%", x = txt)
source(textConnection(txt_mod), ...)
}
filePath <- "R/bits/example.R" # point to a local file on your pc
source_loudly(filePath = filePath, echo = T)
Created on 2021-03-19 by the reprex package (v1.0.0)
Here's a getter/setter hack that comes close without costing too much. While it does require you to change existing code, it has the benefit that you can change the initial assignment to list instead of tracer and everything continues to work unchanged.
tracer <- local({
.e <- NULL
function(..., name = "unk") {
.e <<- list(...)
.e$.name <<- name
`class<-`(.e, c("tracer", "environment"))
}
})
`[.tracer` <- `[[.tracer` <- `$.tracer` <- function(x, i) {
cat(sprintf("get: %s\n", deparse(substitute(i))))
NextMethod()
}
`[<-.tracer` <- `[[<-.tracer` <- `$<-.tracer` <- function(x, i, value) {
cat(sprintf("set: %s <- %s\n", deparse(substitute(i)),
substr(paste(deparse(substitute(value)), collapse = " "), 1, 80)))
NextMethod()
}
Notes:
deparse tends to split long lines into a vector of strings; this is mitigated here with paste(..., collapse=" ");
... but long literal values (e.g., frames) can be a bit annoying in the logs, so I arbitrarily chose substr(., 1, 80) as a reasonable size to log.
this hints at one problem I'll expand on below: this doesn't tell you which columns have been modified, just that the object has been updated.
Demonstration with "simple" objects:
quux <- tracer(a=1, b=2:3, d=list(pi, "a"), mt=mtcars[1:2,])
quux$a
# get: "a"
# [1] 1
quux$a <- 11
# set: "a" <- 11
quux$b
# get: "b"
# [1] 2 3
quux$b <- 2:5
# set: "b" <- 2:5
quux$b
# get: "b"
# [1] 2 3 4 5
So far, so good. Now onto the list:
quux$d
# get: "d"
# [[1]]
# [1] 3.141593
# [[2]]
# [1] "a"
quux$d[[1]]
# get: "d"
# [1] 3.141593
quux$d[[1]] <- pi^2
# get: "d"
# set: "d" <- list(9.86960440108936, "a")
The latter needs some explanation, notably about the order of operations. The assignment is really `[[<-`(quux$d, 1, pi^2), which is not traced. This adjusts the first element of the list, and then assigns this new list back to quux$d, where our $<-.tracer sees that full-list reassignment.
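Conceptually, R rewrites that compound assignment via replacement functions, which is why the tracer sees a get followed by a whole-list set. A rough sketch of the mechanics (not the literal internals):
# quux$d[[1]] <- pi^2 behaves approximately like:
tmp  <- `$`(quux, "d")          # traced: get "d"
tmp  <- `[[<-`(tmp, 1, pi^2)    # untraced: ordinary list method
quux <- `$<-`(quux, "d", tmp)   # traced: set "d" <- <entire new list>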
That is not completely unreasonable for small objects, but it becomes a little more annoying with larger objects:
quux$mt$cyl
# get: "mt"
# [1] 6 6
quux$mt$cyl <- quux$mt$cyl + 5
# get: "mt"
# get: "mt"
# set: "mt" <- structure(list(mpg = c(21, 21), cyl = c(11, 11), disp = c(160, 160), hp = c(110, 110), drat = c(3.9, 3.9), wt = c(2.62, 2.875 ), qsec = c(16.46, 17.02), vs = c(0, 0), am = c(1, 1), gear = c(4, 4), c
quux$mt$cyl
# get: "mt"
# [1] 11 11
Similarly, for an assignment we see both the first "get" step and then the whole-object reassignment. (It is cut off because I used substr(., 1, 80).)
Also, note that in both quux$d and quux$mt, the tracer functions never see the sub-element or column being adjusted. Since R orders the operations as it does, our tracer functions cannot reveal what is going on there (easily).
Now, when you're ready to remove this level of activity-logging, just replace your initial call to tracer(.) with list(.), and all operations continue to work but without logging.
quux <- list(a=1, b=2:3, d=list(pi, "a"), mt=mtcars[1:2,])
quux$a
# [1] 1
quux$a <- 11
quux$b
# [1] 2 3
quux$b <- 2:5
quux$b
# [1] 2 3 4 5
quux$d
# [[1]]
# [1] 3.141593
# [[2]]
# [1] "a"
quux$d[[1]]
# [1] 3.141593
quux$d[[1]] <- pi^2
quux$mt$cyl
# [1] 6 6
quux$mt$cyl <- quux$mt$cyl + 5
quux$mt$cyl
# [1] 11 11
I have around 70,000 frequent_words which I want to keep in a text corpus, in the same order they appeared (order matters). I got them like this:
dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf)))
frequent_words <- findFreqTerms(dtm, lowfreq=50)
Just doing:
dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf)))
dtm <- removeSparseTerms(dtm, 0.8)
That would not work, since I need the same filtered text_corpus twice:
dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf)))
BigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
bidtm <- DocumentTermMatrix(txt_corpus, control = list(tokenize = BigramTokenizer))
I tried the code below:
keepWords <- content_transformer(function(x, words) {
regmatches(x,
gregexpr(paste0("(\\b", paste(words, collapse = "\\b|\\b"), "\\b)"), x, perl = T, ignore.case=T, useBytes = T)
, invert = T) <- " "
return(x)
})
txt_corpus <- tm_map(txt_corpus, keepWords, frequent_words)
When I run it I get the error:
Error in gregexpr(paste0("(\\b", paste(words, collapse = "\\b|\\b"), "\\b)"), :
assertion 'tree->num_tags == num_tags' failed in executing regexp: file 'tre-compile.c', line 634
Calls: preprocess ... tm_parLapply -> lapply -> FUN -> FUN -> regmatches<- -> gregexpr
Execution halted
This is caused by the overly long regular expression. Removing the non-frequent words instead is out of the question, since length(less_frequent_words) > 1,000,000 and it takes too long with:
chunk <- 500
n <- length(less_frequent_words)
r <- rep(1:ceiling(n/chunk),each=chunk)[1:n]
d <- split(less_frequent_words, r)
for (i in 1:length(d)) {
txt_corpus <- tm_map(txt_corpus, removeWords, c(paste(d[[i]])))
}
I also tried something with joining, but it gives me a separate text corpus in each iteration:
chunk <- 500
n <- length(frequent_words)
r <- rep(1:ceiling(n/chunk),each=chunk)[1:n]
d <- split(frequent_words, r)
joined_txt_corpus <- VCorpus(VectorSource(list()))
for (i in 1:length(d)) {
new_corpus <- tm_map(txt_corpus, keepWords, c(paste(d[[i]])))
joined_txt_corpus <- c(joined_txt_corpus, new_corpus)
txt_corpus <- tm_map(txt_corpus, removeWords, c(paste(d[[i]])))
}
txt_corpus <- joined_txt_corpus
Is there an efficient way to do the same selection as text_corpus <- tm_map(txt_corpus, keepWords, frequent_words), but with many words? Any help and hints are appreciated! Thanks!
Reproducible example:
library(tm)
data(crude)
txt_corpus <- crude
txt_corpus <- tm_map(txt_corpus, content_transformer(tolower))
txt_corpus <- tm_map(txt_corpus, removePunctuation)
txt_corpus <- tm_map(txt_corpus, stripWhitespace)
article_words <- c("a", "an", "the")
txt_corpus <- tm_map(txt_corpus, removeWords, article_words)
txt_corpus <- tm_map(txt_corpus, removeNumbers)
dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf)))
frequent_words <- findFreqTerms(dtm, lowfreq=80)
dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf), dictionary=frequent_words))
# To simulate using many words, uncomment the next line; with frequent_words used once, the code works
# frequent_words <- c(frequent_words, frequent_words, frequent_words, frequent_words)
# keepWords function
keepWords <- content_transformer(function(x, words) {
regmatches(x,
gregexpr(paste0("(\\b", paste(words, collapse = "\\b|\\b"), "\\b)"), x, perl = T, ignore.case=T)
, invert = T) <- " "
return(x)
})
txt_corpus <- tm_map(txt_corpus, keepWords, frequent_words)
# Get bigram from text_corpus
BigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
bidtm <- DocumentTermMatrix(txt_corpus, control = list(tokenize = BigramTokenizer))
bidtmm <- col_sums(bidtm)
bidtmm <- as.matrix(bidtmm)
print(bidtmm)
Output:
[,1]
in in 14
in of 21
in oil 19
in to 28
of in 21
of of 20
of oil 20
of to 29
oil in 18
oil of 18
oil oil 13
oil to 33
to in 32
to of 35
to oil 21
to to 41
I looked at your requirements, and maybe a combination of tm and quanteda can help. See below.
Once you have a list of frequent words you can use quanteda in parallel to get the bigrams.
library(quanteda)
# set number of threads
quanteda_options(threads = 4)
my_corp <- corpus(crude) # corpus from tm can be used here (txt_corpus)
my_toks <- tokens(my_corp, remove_punct = TRUE) # add extra removal if needed
# Use list of frequent words from tm.
# speed gain should occur here
my_toks <- tokens_keep(my_toks, frequent_words)
# ngrams, concatenator is _ by default
bitoks <- tokens_ngrams(my_toks)
textstat_frequency(dfm(bitoks)) # ordered from high to low
feature frequency rank docfreq group
1 to_to 41 1 12 all
2 to_of 35 2 15 all
3 oil_to 33 3 17 all
4 to_in 32 4 12 all
5 of_to 29 5 14 all
6 in_to 28 6 11 all
7 in_of 21 7 8 all
8 to_oil 21 7 13 all
9 of_in 21 7 10 all
10 of_oil 20 10 14 all
11 of_of 20 10 8 all
12 in_oil 19 12 10 all
13 oil_in 18 13 11 all
14 oil_of 18 13 11 all
15 in_in 14 15 9 all
16 oil_oil 13 16 10 all
quanteda does have a topfeatures() function, but it doesn't work like findFreqTerms(); otherwise you could do it completely in quanteda.
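That said, if you want to stay entirely in quanteda, a rough stand-in for findFreqTerms() is dfm_trim() with min_termfreq. A sketch on the same crude data (fresh object names so the tokens above are left untouched):
toks <- tokens(corpus(crude), remove_punct = TRUE)
freq_feats <- featnames(dfm_trim(dfm(toks), min_termfreq = 80))
bitoks2 <- tokens_ngrams(tokens_keep(toks, freq_feats))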
If the dfm generation takes too much memory, you can use as.character() to transform the tokens object and use the result in either dplyr or data.table. See the code below.
library(dplyr)
out_dp <- tibble(features = as.character(bitoks)) %>%
group_by(features) %>%
tally()
library(data.table)
out_dt <- data.table(features = as.character(bitoks))
out_dt <- out_dt[, .N, by = features]
I wish to convert US Treasury Note futures prices to decimals (so 127-01+ = 127 + 1/32 + 1/64, and 127-01 1/4 = 127 + 1/32 + 1/128, etc.).
I have had some success with cases where a fraction is present, but I'm failing when a fraction does not follow the big figure.
The source of the failure is that strsplit does not return an NA or empty object - so the order of the bits gets knocked about, and subsequent fractions are all wrong as a result.
Here's my case:
ty1 <- c("127-03", "126-31", "126-31+", "127-04+", "127-02+", "127-00+")
ty2 <- c("127-03", "126-31", "127-04+", "127-02+", "127-00+", "127",
"127-01 1/4", "128-01 3/4")
ty1_dec <- c(127+3/32, 126+31/32, 126+31/32+1/64, 127+4/32+1/64,
127+2/32+1/64, 127+0/32+1/64)
ty2_dec <- c(127+3/32, 126+31/32, 127+04/32+1/64, 127+2/32+1/64,
127+0/32+1/64, 127, 127+1/32+0/64+1/128, 128+1/32+1/64+1/128)
tFrac2Dec <-function(x){
hyphSplit <- strsplit(x, "-")
splitMatx <- matrix(unlist(hyphSplit), ncol=2, byrow=TRUE)
fracs <- t(apply(splitMatx[,2, drop=F], 1, function(X) substring(X, c(1,3), c(2,3))))
splitMatx[,2] <- (as.numeric(fracs[,1]) + ifelse(fracs[,2] == "+", 0.5, 0))/32
fracEval <- function(y){
eval(parse(text=paste(y[1], "+", y[2])))
}
apply(splitMatx,1,fracEval)
}
So things work for ty1, as all prices have a hyphen and a following fraction.
R> tFrac2Dec(ty1) == ty1_dec
[1] TRUE TRUE TRUE TRUE TRUE TRUE
R> tFrac2Dec(ty1)
[1] 127.0938 126.9688 126.9844 127.1406 127.0781 127.0156
But when there is a price that is only the big figure it fails.
> tFrac2Dec(ty2) == ty2_dec
Error in parse(text = paste(y[1], "+", y[2])) :
<text>:1:4: unexpected numeric constant
1: 01 1
^
In addition: Warning message:
In matrix(unlist(hyphSplit), ncol = 2, byrow = TRUE) :
data length [15] is not a sub-multiple or multiple of the number of rows [8]
I can see that it's due to the first strsplit step, but I cannot figure out the solution. I've fiddled about with some if-type solutions but nothing is working.
Is there a simple way to do this?
Something like this?
tFrac2Dec <- function(x) {
  hyphSplit <- strsplit(x, "-")
  ch1 <- sapply(hyphSplit, `[`, 1)  # big figure, e.g. "127"
  ch2 <- sapply(hyphSplit, `[`, 2)  # 32nds part, NA when there is none
  x32 <- as.integer(substring(ch2, 1, 2))
  x128 <- rep(0, length(ch1))       # extra 128ths: "+" = 2, "1/4" = 1, "3/4" = 3
  x128[grep("\\+$", ch2)] <- 2
  x128[grep("1/4", ch2, fixed=TRUE)] <- 1
  x128[grep("3/4", ch2, fixed=TRUE)] <- 3
  dec <- x32/32 + x128/128
  dec[is.na(dec)] <- 0              # big-figure-only prices like "127"
  as.integer(ch1) + dec
}
tFrac2Dec(ty1)
## [1] 127.0938 126.9688 126.9844 127.1406 127.0781 127.0156
tFrac2Dec(ty2)
## [1] 127.0938 126.9688 127.1406 127.0781 127.0156 127.0000 127.0391 128.0547
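As a quick check against the expected decimals defined in the question, both comparisons come out TRUE:
all.equal(tFrac2Dec(ty1), ty1_dec)
## [1] TRUE
all.equal(tFrac2Dec(ty2), ty2_dec)
## [1] TRUE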