Topic label of each document in LDA model using textmineR

Topic label of each document in LDA model using textmineR - r

I'm using textmineR to fit an LDA model to documents, similar to the example at https://cran.r-project.org/web/packages/textmineR/vignettes/c_topic_modeling.html. Is it possible to get the topic label for each document in the data set?
>library(textmineR)
>data(nih_sample)
> # create a document term matrix
> dtm <- CreateDtm(doc_vec = nih_sample$ABSTRACT_TEXT,doc_names =
nih_sample$APPLICATION_ID, ngram_window = c(1, 2), stopword_vec =
c(stopwords::stopwords("en"), stopwords::stopwords(source = "smart")),lower
= TRUE, remove_punctuation = TRUE,remove_numbers = TRUE, verbose = FALSE,
cpus = 2)
>dtm <- dtm[,colSums(dtm) > 2]
>set.seed(123)
> model <- FitLdaModel(dtm = dtm, k = 20,iterations = 200,burnin =
180,alpha = 0.1, beta = 0.05, optimize_alpha = TRUE, calc_likelihood =
TRUE,calc_coherence = TRUE,calc_r2 = TRUE,cpus = 2)
then adding the labels to the model:
> model$labels <- LabelTopics(assignments = model$theta > 0.05, dtm = dtm,
M = 1)
Now I want the topic label for each of the 100 documents in nih_sample$ABSTRACT_TEXT.

Are you looking to label each document with the label of its most prevalent topic? If so, this is how you could do it:
# Lookup table: one row per topic, pairing the topic name with its label.
label_df <- data.frame(
  topic = rownames(model$labels),
  label = model$labels,
  stringsAsFactors = FALSE
)
# For every document (row of theta), take the name of its highest-weight topic.
top_topics <- apply(
  model$theta,
  1,
  function(doc_weights) {
    strongest <- which.max(doc_weights)
    names(doc_weights)[strongest][1]
  }
)
# Turn the named vector into a two-column data frame keyed by document name.
top_topics <- data.frame(
  document = names(top_topics),
  top_topic = top_topics,
  stringsAsFactors = FALSE
)
# Left-join the labels onto the documents via the topic name, so every
# document row carries the label of its top topic.
top_topics <- merge(
  x = top_topics,
  y = label_df,
  by.x = "top_topic",
  by.y = "topic",
  all.x = TRUE
)
This kind of throws away some information that you'd get from LDA though. One advantage of LDA is that each document can have more than one topic. Another is that we can see how much of each topic is in that document. You can do that here by
# Enlarge the bottom margin (first entry of mar) so the labels under the bars fit.
par(mar = c(8.1, 4.1, 4.1, 2.1))
# Bar chart of the first row of theta (document 1), one bar per topic label;
# las = 2 draws the axis labels perpendicular to the axis.
barplot(
  height = model$theta[1, ],
  names.arg = model$labels,
  las = 2
)

Related

Complexheatmap with multiple files plotting

I would like to use ComplexHeatmap to plot a separate heatmap for each of several files (data frames).
So far I have been able to do this for a small subset of files.
Reading files as list
# Read every .txt table found in Model_hmap/ into a list of data frames.
# The first column of each file supplies the row names.
list_of_files <- list.files("Model_hmap/", pattern = "\\.txt$", full.names = TRUE)
# Alternative reader, kept for reference:
# all_csv <- lapply(list_of_files, read_delim, delim = "\t", escape_double = FALSE, trim_ws = TRUE)
all_csv <- lapply(
  list_of_files,
  read.table,
  strip.white = FALSE, check.names = FALSE, header = TRUE, row.names = 1
)
# Candidate column-name sets; only the uncommented one is assigned.
# my_names = c("gene","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","UP_DOWN")
my_names <- c("Symbol", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj", "UP_DOWN")
# my_names = c['X2']
# my_names = c("Peak","annotation","ENSEMBL","log2FoldChange","padj","UP_DOWN")
# Drop columns 1-7 and 155 (selected by position) from every table.
result_abd <- lapply(all_csv, function(x) subset(x, select = -c(1:7, 155)))
# Name each table after the file it was read from. The names are derived from
# list_of_files itself (not from a second, unfiltered directory listing), so
# they always line up one-to-one with all_csv even when the directory also
# holds files that do not match the .txt pattern.
names(result_abd) <- gsub(".txt", "", basename(list_of_files), fixed = TRUE)
Then Scaling the data
# Row-wise z-score of a table.
#
# result_abd: a matrix or data frame of numeric values.
# Returns a matrix with the same row/column layout in which every row has been
# centred and scaled by scale(). The input is transposed first because scale()
# works column by column, then transposed back.
fun <- function(result_abd) {
  # seq_len() is safe for zero columns, and drop = FALSE keeps a one-column
  # input two-dimensional. The value is returned visibly rather than as the
  # (invisible) result of an assignment.
  t(scale(t(result_abd[, seq_len(ncol(result_abd)), drop = FALSE])))
}
# Apply fun to every table in result_abd; SIMPLIFY = FALSE keeps the result a list.
p2 <- mapply(fun, result_abd, SIMPLIFY = FALSE)
The next step is to use the metadata with which I would like to annotate my heatmap.
My metadata looks like this:
dput(head(metadata))
structure(list(patient = c("TCGA-AB-2856", "TCGA-AB-2849", "TCGA-AB-2971",
"TCGA-AB-2930", "TCGA-AB-2891", "TCGA-AB-2872"), prior_malignancy = c("no",
"no", "no", "no", "no", "no"), FAB = c("M4", "M0", "M4", "M2",
"M1", "M3"), Risk_Cyto = c("Intermediate", "Poor", "Intermediate",
"Intermediate", "Poor", "Good")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
To read the above metadata I'm doing the following; I'm not sure whether it is the right approach.
# Collect the paths of all .txt files under Model_hmap_meta/.
list_of_files1 <- list.files(
  path = "Model_hmap_meta/",
  pattern = "\\.txt$",
  full.names = TRUE
)
# Parse each file as a tab-separated table with a header row; the first
# column supplies the row names.
meta1 <- lapply(
  list_of_files1,
  function(path) read.table(path, row.names = 1, sep = "\t", header = TRUE)
)
Now I'm stuck at the step above: I'm not sure how to pass the metadata as a list, the way I did for the gene-expression data frames from which I calculated the z-scores (that result is a list). So I think the metadata should be of the same class if I want to use it this way.
For a single file, this is how I used to add the annotation to my final plot:
# Load the annotation table for a single file (tab-delimited).
metadata <- read_delim(
  "Model_hmap_meta/FAB_table.txt",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
# Quick look at the first rows and the dimensions.
head(metadata)
dim(metadata)
# Annotation data frame: one column per annotation track.
ann <- data.frame(FAB = metadata$FAB, Risk_Cyto = metadata$Risk_Cyto)
# Fixed colour for every level of each track.
colours <- list(
  "FAB" = c(
    "M0" = "red2", "M1" = "royalblue", "M2" = "gold",
    "M3" = "forestgreen", "M4" = "chocolate", "M5" = "Purple"
  ),
  "Risk_Cyto" = c(
    "Good" = "limegreen", "Intermediate" = "navy",
    "N.D." = "magenta", "Poor" = "black"
  )
)
# Column annotation built from ann and the colour mapping above.
colAnn <- HeatmapAnnotation(
  df = ann,
  col = colours,
  which = "col",
  gap = unit(1, "mm"),
  annotation_width = unit(c(1, 4), "cm")
)
If I understand correctly, this annotation is what I need to pass along for each element of the list, which is what I am not able to do.
My plotting function.
This is the code I use to plot.
# Assemble the heatmap of `heat`, with colAnn as the top annotation.
hm1 <- Heatmap(
  heat,
  name = "Z-score",
  # Colour mapping anchored at -2.6, -1, 0, 1 and 2.6
  col = colorRamp2(
    c(-2.6, -1, 0, 1, 2.6),
    c("blue", "skyblue", "white", "lightcoral", "red")
  ),
  # heatmap_legend_param=list(at=c(-2.6,-1,0,1,2.6),color_bar="continuous",
  #   legend_direction="vertical", legend_width=unit(5,"cm"),
  #   title_position="topcenter", title_gp=gpar(fontsize=10, fontface="bold")),

  # Rows: clustered, with dendrogram and row names hidden
  cluster_rows = TRUE,
  show_row_dend = FALSE,
  show_row_names = FALSE,
  row_names_side = "left",
  row_title_side = "right",
  row_title_gp = gpar(fontsize = 8),

  # Columns: clustered, dendrogram shown, column names hidden
  cluster_columns = TRUE,
  show_column_dend = TRUE,
  show_column_names = FALSE,
  column_names_gp = gpar(fontsize = 12, fontface = "bold"),
  column_title = "DE genes",
  column_title_side = "top",
  column_title_gp = gpar(fontsize = 15, fontface = "bold"),

  # Column dendrogram settings
  clustering_distance_columns = "euclidean",
  clustering_method_columns = "complete",
  column_dend_height = unit(10, "mm"),

  # Row dendrogram settings
  clustering_distance_rows = "euclidean",
  clustering_method_rows = "complete",
  row_dend_width = unit(4, "cm"),
  row_dend_side = "left",
  row_dend_reorder = TRUE,

  # Border and k-means splits (1 = no split)
  border = TRUE,
  row_km = 1,
  column_km = 1,

  # Optional size settings, currently disabled
  # width = unit(5, "inch"),
  # height = unit(4, "inch"),
  # height = unit(0.4, "cm")*nrow(mat),

  # Annotation track(s) above the columns
  top_annotation = colAnn
)
# Render the heatmap with both legends on the right-hand side.
draw(hm1, heatmap_legend_side = "right", annotation_legend_side = "right")
Objective
How do I wrap all of the above into a small function that takes multiple files as input and plots each of them?
UPDATE
Data files
My data files and my metadata file

Using the code you provided I made the following function (make_heatmap). Some of the read in statements are altered to match what I was working with on my machine. I also only used 2 of your files but it should work with all 4 that you're using.
This function will allow you to pass the counts matrix (which you normalize and set up before passing to the function). The assumption is that you're using the same metadata/annotation for each file you're passing. If you have different annotation files you could set up the heatmap annotation before the function and then pass that to the function. This is a bit more tedious though.
Usually, the way I set up my heatmap analyses is that I have one script containing all of my functions (one for each type of heatmap I have to make), and then every time I need to make a new heatmap I have another script where I read in and prepare (i.e. median-center) my counts matrix and then call the heatmap function I need.
# Read every file in the working directory whose name matches 'MAP' into a
# list of data frames. The first column of each file supplies the row names.
list_of_files <- dir(pattern = "MAP", full.names = TRUE)
# Alternative reader, kept for reference:
# all_csv <- lapply(list_of_files, read_delim, delim = "\t", escape_double = FALSE, trim_ws = TRUE)
all_csv <- lapply(
  list_of_files,
  read.table,
  strip.white = FALSE, check.names = FALSE, header = TRUE, row.names = 1
)
# Candidate column-name sets; only the uncommented one is assigned.
# my_names = c("gene","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","UP_DOWN")
my_names <- c("Symbol", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj", "UP_DOWN")
# my_names = c['X2']
# my_names = c("Peak","annotation","ENSEMBL","log2FoldChange","padj","UP_DOWN")
# Drop columns 1-7 and 155 (selected by position) from every table.
result_abd <- lapply(all_csv, function(x) subset(x, select = -c(1:7, 155)))
# Name each table after the file it was read from. The files above come from
# dir(pattern = "MAP"), so the names must be derived from that same listing;
# taking them from a listing of "Model_hmap/" can give a vector of a different
# length or order than all_csv.
names(result_abd) <- gsub(".txt", "", basename(list_of_files), fixed = TRUE)
# Row-wise z-score of a table.
#
# result_abd: a matrix or data frame of numeric values.
# Returns a matrix with the same row/column layout in which every row has been
# centred and scaled by scale(). The input is transposed first because scale()
# works column by column, then transposed back.
fun <- function(result_abd) {
  # seq_len() is safe for zero columns, and drop = FALSE keeps a one-column
  # input two-dimensional. The value is returned visibly rather than as the
  # (invisible) result of an assignment.
  t(scale(t(result_abd[, seq_len(ncol(result_abd)), drop = FALSE])))
}
# Apply fun to every table in result_abd; SIMPLIFY = FALSE keeps the result a list.
p2 <- mapply(fun, result_abd, SIMPLIFY = FALSE)
# list_of_files1 <- list.files('Model_hmap_meta/',pattern = '\\.txt$', full.names = TRUE)
# #Further arguments to read.csv can be passed in ...
# meta1 <- lapply(list_of_files1,read.table, row.names = 1,sep = "\t",header = TRUE)
# Draw one annotated heatmap for a single matrix.
#
# Args:
#   counts_matrix: matrix handed to Heatmap() as-is; any scaling has to be
#     done by the caller. Its columns are the annotated observations.
#   metadata_file: path to a tab-separated table with a header row that
#     provides FAB and Risk_Cyto columns, one row per column of
#     counts_matrix. Defaults to "FAB_table.txt", the path that was
#     previously hard-coded.
#
# Returns the value of draw(), after rendering on the active graphics device.
# Stops early with a message if the metadata lacks a required column or its
# row count differs from ncol(counts_matrix).
make_heatmap <- function(counts_matrix, metadata_file = "FAB_table.txt") {
  metadata <- read.table(metadata_file, sep = "\t", header = TRUE)

  if (is.null(metadata$FAB) || is.null(metadata$Risk_Cyto)) {
    stop(
      "metadata file must contain 'FAB' and 'Risk_Cyto' columns: ",
      metadata_file,
      call. = FALSE
    )
  }
  ann <- data.frame(FAB = metadata$FAB, Risk_Cyto = metadata$Risk_Cyto)

  # NOTE(review): annotation rows are paired with matrix columns purely by
  # position; nothing here matches them by sample name. Confirm the metadata
  # file is in the same order as the columns of counts_matrix.
  if (nrow(ann) != ncol(counts_matrix)) {
    stop(
      "metadata has ", nrow(ann), " rows but counts_matrix has ",
      ncol(counts_matrix), " columns",
      call. = FALSE
    )
  }

  # Fixed colour for every level of each annotation track.
  colours <- list(
    "FAB" = c(
      "M0" = "red2", "M1" = "royalblue", "M2" = "gold",
      "M3" = "forestgreen", "M4" = "chocolate", "M5" = "Purple"
    ),
    "Risk_Cyto" = c(
      "Good" = "limegreen", "Intermediate" = "navy",
      "N.D." = "magenta", "Poor" = "black"
    )
  )
  colAnn <- HeatmapAnnotation(
    df = ann,
    which = "col",
    col = colours,
    annotation_width = unit(c(1, 4), "cm"),
    gap = unit(1, "mm")
  )

  hm1 <- Heatmap(
    counts_matrix,
    col = colorRamp2(
      c(-2.6, -1, 0, 1, 2.6),
      c("blue", "skyblue", "white", "lightcoral", "red")
    ),
    # heatmap_legend_param=list(at=c(-2.6,-1,0,1,2.6),color_bar="continuous",
    #   legend_direction="vertical", legend_width=unit(5,"cm"),
    #   title_position="topcenter", title_gp=gpar(fontsize=10, fontface="bold")),
    name = "Z-score",
    # Row annotation configuration
    cluster_rows = TRUE,
    show_row_dend = FALSE,
    row_title_side = "right",
    row_title_gp = gpar(fontsize = 8),
    show_row_names = FALSE,
    row_names_side = "left",
    # Column annotation configuration
    cluster_columns = TRUE,
    show_column_dend = TRUE,
    column_title = "DE genes",
    column_title_side = "top",
    column_title_gp = gpar(fontsize = 15, fontface = "bold"),
    show_column_names = FALSE,
    column_names_gp = gpar(fontsize = 12, fontface = "bold"),
    # Dendrogram configuration: columns
    clustering_distance_columns = "euclidean",
    clustering_method_columns = "complete",
    column_dend_height = unit(10, "mm"),
    # Dendrogram configuration: rows
    clustering_distance_rows = "euclidean",
    clustering_method_rows = "complete",
    row_dend_width = unit(4, "cm"),
    row_dend_side = "left",
    row_dend_reorder = TRUE,
    # Splits (1 = no split)
    border = TRUE,
    row_km = 1,
    column_km = 1,
    # Plot size parameters, currently disabled
    # width = unit(5, "inch"),
    # height = unit(4, "inch"),
    # height = unit(0.4, "cm")*nrow(mat),
    # Annotations
    top_annotation = colAnn
  )

  # Plot the heatmap with both legends on the right-hand side.
  draw(hm1, annotation_legend_side = "right", heatmap_legend_side = "right")
}
# Plot the first two elements of p2, each converted to a matrix first.
make_heatmap(as.matrix(p2[[1]])) #just call the function with the counts matrix
make_heatmap(as.matrix(p2[[2]]))
If you need to output the heatmap to a pdf or something, you can do that before calling the function or you can put that command inside of the heatmap function (just make sure to call dev.off() inside the function too in that case).

Error in `V<-`(`*tmp*`, value = `*vtmp*`) : invalid indexing

I am using the bibliometrix package in R and want to plot some useful graphs.
library(bibliometrix)
# Search the installed help pages for "bibliometrix".
??bibliometrix
# Read the export file from local disk and convert it (dbsource "isi", plaintext format).
D<-readFiles("E:\\RE\\savedrecs.txt")
M <- convert2df(D,dbsource = "isi", format= "plaintext")
# Analyse M, then summarise and plot the results with k = 10.
results <- biblioAnalysis(M ,sep = ";" )
S<- summary(object=results,k=10, pause=FALSE)
plot(x=results,k=10,pause=FALSE)
options(width=100)
# Same summary call as above, repeated after setting the console width to 100.
S <- summary(object = results, k = 10, pause = FALSE)
# NOTE(review): `M1` is not defined anywhere in this snippet; the object created above is `M`.
NetMatrix <- biblioNetwork(M1, analysis = "co-occurrences", network = "author_keywords", sep = ";")
# S is overwritten here: from this point on it holds the normalized matrix, not the summary.
S <- normalizeSimilarity(NetMatrix, type = "association")
net <- networkPlot(S, n = 200, Title = "co-occurrence network",type="fruchterman", labelsize = 0.7, halo = FALSE, cluster = "walktrap",remove.isolates=FALSE, remove.multiple=FALSE, noloops=TRUE, weighted=TRUE)
res <- thematicMap(net, NetMatrix, S)
plot(res$map)
But the call net <- networkPlot(S, n = 200, Title = "co-occurrence network",type="fruchterman", labelsize = 0.7, halo = FALSE, cluster = "walktrap",remove.isolates=FALSE, remove.multiple=FALSE, noloops=TRUE, weighted=TRUE) fails with this error:
Error in `V<-`(`*tmp*`, value = `*vtmp*`) : invalid indexing
Also, I cannot do the CR analysis; it always shows unlistCR. I cannot use the NetMatrix function either.
Could someone please help me?

The problem is in the data itself, not in the code you presented. When I downloaded the data from bibliometrix.org and changed M1 to M (typo?) in the biblioNetwork function call, everything worked perfectly. Please see the code below:
library(bibliometrix)
# Plot bibliometric analysis results
# The records are read from the URL below and converted (dbsource "isi", plaintext format).
D <- readFiles("http://www.bibliometrix.org/datasets/savedrecs.txt")
M <- convert2df(D, dbsource = "isi", format= "plaintext")
results <- biblioAnalysis(M, sep = ";")
# First summary uses the default arguments; S is reassigned further down.
S <- summary(results)
plot(x = results, k = 10, pause = FALSE)
# Plot Bibliographic Network
options(width = 100)
S <- summary(object = results, k = 10, pause = FALSE)
# Here the network is built from M (the question's code passed M1).
NetMatrix <- biblioNetwork(M, analysis = "co-occurrences", network = "author_keywords", sep = ";")
# S is overwritten again: it now holds the normalized matrix passed to networkPlot and thematicMap.
S <- normalizeSimilarity(NetMatrix, type = "association")
net <- networkPlot(S, n = 200, Title = "co-occurrence network", type = "fruchterman",
labelsize = 0.7, halo = FALSE, cluster = "walktrap",
remove.isolates = FALSE, remove.multiple = FALSE, noloops = TRUE, weighted = TRUE)
# Plot Thematic Map
res <- thematicMap(net, NetMatrix, S)
# Print the structure of M for inspection.
str(M)
plot(res$map)

Skip empty panel using lattice package, R programming

I want to skip an empty panel using the lattice package in R.
# Toy data set: 48 rows, 16 per treatment, with a fixed seed for the random
# severity values.
set.seed(1)
df1 <- data.frame(
  treatment = rep(c("A", "B", "C"), each = 16),
  # 16-value pattern (8 x "1", 8 x "2"); data.frame() recycles it to 48 rows
  disease_type = rep(c("1", "2"), each = 8),
  days_after_application = rep(rep(c("10-24", "24-48"), each = 4), times = 6),
  severity = rnorm(48, mean = 80, sd = 5)
)
# Blank out severity for disease type "2" at "24-48" days, so that this
# combination has no observed values.
df1$severity[
  df1$disease_type == "2" & df1$days_after_application == "24-48"
] <- NA
library(lattice)
# Box-and-whisker plot of severity per treatment, conditioned on both
# days_after_application and disease_type and laid out on a 2 x 2 grid.
figure1 <- bwplot(
  treatment ~ severity | days_after_application + disease_type,
  data = df1,
  layout = c(2, 2),
  strip = strip.custom(strip.names = TRUE)
)
# Write the plot to figure1.jpeg; lattice objects are drawn by print().
jpeg(filename = "figure1.jpeg")
print(figure1)
dev.off()
Here is what I get
My question is how I can remove/skip empty panel in the top right WITHOUT changing layout?
I have tried the following code. However, it doesn't work.
# Same plot as figure1 with a skip vector added: four entries, only the last one TRUE.
figure2 <- bwplot(treatment~severity|days_after_application+disease_type,
data = df1,layout = c(2,2),
strip = strip.custom(strip.names = TRUE),
skip = c(FALSE,FALSE,FALSE,TRUE))
# Write the plot to figure2.jpeg.
jpeg("figure2.jpeg")
print(figure2)
dev.off()
Here is what I got
I also tried the following code, but it is not what I want, since I do want two-level strips.
# Set every column (not only severity) to NA in the rows for disease type "2" at "24-48" days.
df1[(df1$disease_type == "2" & df1$days_after_application == "24-48"),] <- NA
# Condition on a single combined factor built with interaction() instead of on two separate factors.
bwplot(treatment~severity|interaction(days_after_application,disease_type),
data = df1,layout = c(2,2),
strip = strip.custom(strip.names = TRUE))
Thank you!

I got help from a professor at Temple University.
Here is his solution:
## Two-factor conditioning on a 2 x 2 layout; the fourth layout position is skipped.
figure4 <- bwplot(treatment~severity|days_after_application+disease_type,
data = df1,layout = c(2,2),
strip = strip.custom(strip.names = TRUE),
skip = c(FALSE,FALSE,FALSE,TRUE),
scales=list(alternating=FALSE), ## keep x-scale on bottom
between=list(x=1, y=1)) ## space between panels
pdf("figure4%03d.pdf",onefile = FALSE) ## onefile = FALSE: each page goes to its own numbered file (figure4001.pdf, ...)
## NOTE(review): presumably the skipped empty panel ends up alone on a second page/file, leaving the first file as the wanted figure -- confirm.
print(figure4)
dev.off()

R package mlr Multilabel Text Classification: how to classify new data

I found this code in a tutorial about multilabel classification with package mlr.
library("mlr")
# Pull the data out of yeast.task (not defined in this snippet; presumably shipped with mlr).
yeast = getTaskData(yeast.task)
# The first 14 column names are used as the label (target) columns.
labels = colnames(yeast)[1:14]
# Rebuild a multilabel task on that data with those columns as targets.
yeast.task = makeMultilabelTask(id = "multi", data = yeast, target = labels)
# rpart classifier with probability predictions, wrapped by the binary relevance wrapper.
lrn.br = makeLearner("classif.rpart", predict.type = "prob")
lrn.br = makeMultilabelBinaryRelevanceWrapper(lrn.br)
# Train on rows 1-1500, giving each of those rows the same weight.
mod = train(lrn.br, yeast.task, subset = 1:1500, weights = rep(1/1500, 1500))
# Predict for rows 1-10 of the task, then overwrite pred with predictions for rows 1501-1600 passed as newdata.
pred = predict(mod, task = yeast.task, subset = 1:10)
pred = predict(mod, newdata = yeast[1501:1600,])
I understand the structure of the yeast dataset, but I do not understand how to use the code when I have new data that I want to classify, because then I wouldn't have any TRUE or FALSE values for the labels. In other words, I would have some training data with the same structure as yeast, but for my new data columns 1:14 would be missing.
Am I misunderstanding something? If not, how can I use the code correctly?
Edit:
Here's some sample code showing how I would use it:
library("tm")
# Toy data: ids 1 and 4 each appear twice in train.data, with the same text but a different label.
train.data = data.frame("id" = c(1,1,2,3,4,4), "text" = c("Monday is nice weather.", "Monday is nice weather.", "Dogs are cute.", "It is very rainy.", "My teacher is angry.", "My teacher is angry."), "label" = c("label1", "label2", "label3", "label1", "label4", "label5"))
# test.data has ids and texts only, no label column.
test.data = data.frame("id" = c(5,6), "text" = c("Next Monday I will meet my teacher.", "Dogs do not like rain."))
# Force id, text and label to character.
train.data$text = as.character(train.data$text)
train.data$id = as.character(train.data$id)
train.data$label = as.character(train.data$label)
test.data$text = as.character(test.data$text)
test.data$id = as.character(test.data$id)
### Bring training data into structure
train.data$label = make.names(train.data$label)
labels = unique(train.data$label)
# DocumentTermMatrix for all texts
# Training and test texts are pooled and de-duplicated; unique() is then applied a second time inside VectorSource().
texts = unique(c(train.data$text, test.data$text))
docs <- Corpus(VectorSource(unique(texts)))
terms <-DocumentTermMatrix(docs)
m <- as.data.frame(as.matrix(terms))
# Logical columns for labels
# Cross-tabulate id by label into counts, then convert the counts to TRUE/FALSE.
test = data.frame("id" = train.data$id, "topic"=train.data$label)
test2 = as.data.frame(unclass(table(test)))
test2[,c(1:ncol(test2))] = as.logical(unlist(test2[,c(1:ncol(test2))]))
rownames(test2) = unique(test$id)
# Bind columns from dtm
# NOTE(review): rows are paired by position -- this assumes the first nrow(test2) rows of m are the training texts in the same order as the ids; verify.
termsDf = cbind(test2, m[1:nrow(test2),])
names(termsDf) = make.names(names(termsDf))
### Create Multilabel Task
classify.task = makeMultilabelTask(id = "multi", data = termsDf, target = labels)
### Now the model
lrn.br = makeLearner("classif.rpart", predict.type = "prob")
lrn.br = makeMultilabelBinaryRelevanceWrapper(lrn.br)
mod = train(lrn.br, classify.task)
### How can I predict for test.data?
So the problem is that I do not have any labels for test.data, because those are exactly what I would like to compute.
Edit2:
When I simply use
# Make the term column names syntactically valid, as was done for termsDf.
names(m) = make.names(names(m))
# Use the rows of m that follow the first nrow(termsDf) rows as the new data, one per test document.
pred = predict(mod, newdata = m[(nrow(termsDf)+1):(nrow(termsDf)+nrow(test.data)),])
the result is the same for both texts and really not what I would expect.

Warning message from ggfortify::ggbiplot

I am trying to use ggbiplot from the ggfortify package. It seems to be working fine, but I am getting the following warning messages:
# Fit a PLS regression of mpg on all other mtcars columns, then copy its scores and loadings into data frames.
mdl <- pls::plsr(mpg ~ ., data = mtcars, scale = T)
scrs <- data.frame(pls::scores(mdl)[])
loads <- data.frame(pls::loadings(mdl)[])
# Both *.label.label arguments below receive whole vectors of row names.
ggfortify::ggbiplot(scrs, loads,
label.label = rownames(scrs), asp = 1, label = T, label.size = 3,
loadings = T, loadings.label = T, loadings.label.label = rownames(loads))
Warning messages:
1: In if (value %in% columns) { :
the condition has length > 1 and only the first element will be used
2: In if (value %in% columns) { :
the condition has length > 1 and only the first element will be used
Have I taken a wrong step, or is it a bug?

According to the ggbiplot documentation, the label.label= parameter expects the column names from which to pull the names; it does not expect a vector of names. Same goes for loadings.label.label=. (ggplot and most tidyverse functions don't like rownames very much -- better to make them a proper column)
# Copy the row names into a regular ID column in both tables, so the labels
# can be referred to by column name.
scrs[["ID"]] <- rownames(scrs)
loads[["ID"]] <- rownames(loads)
# Biplot with point labels and loading labels both taken from the ID column.
ggfortify::ggbiplot(
  scrs, loads,
  asp = 1,
  label = TRUE, label.label = "ID", label.size = 3,
  loadings = TRUE,
  loadings.label = TRUE, loadings.label.label = "ID"
)

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Topic label of each document in LDA model using textmineR - r

Related

Complexheatmap with multiple files plotting

Error in `V<-`(`tmp`, value = `vtmp`) : invalid indexing

Skip empty panel using lattice package, R programming

R package mlr Multilabel Text Classification: how to classify new data

Warning message from ggfortify::ggbiplot

Categories

Resources

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Topic label of each document in LDA model using textmineR - r

Related

Complexheatmap with multiple files plotting

Error in `V<-`(`*tmp*`, value = `*vtmp*`) : invalid indexing

Skip empty panel using lattice package, R programming

R package mlr Multilabel Text Classification: how to classify new data

Warning message from ggfortify::ggbiplot

Categories

Resources

Error in `V<-`(`tmp`, value = `vtmp`) : invalid indexing