Print out the text value of the points on a cluster when using UMAP and HDBScan and BERT sentence transformer - bert-language-model

I have seen a number of questions similar to this but my cluster labels consist of sentence embeddings, thus a better question may be how do I get text values from the sentence embeddings?
How can I get from my sentence embeddings to print a text output?
umap_embeddings = umap.UMAP(n_neighbors=50,
n_components=5,
metric='cosine').fit_transform(embeddings)
cluster = hdbscan.HDBSCAN(min_cluster_size=3,
metric='euclidean',
cluster_selection_method='eom').fit(umap_embeddings)
# Prepare data
umap_data = umap.UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=24).fit_transform(embeddings)
result = pd.DataFrame(umap_data, columns=['x', 'y'])
result['labels'] = cluster.labels_
# Visualize clusters
fig, ax = plt.subplots(figsize=(20, 10))
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
plt.scatter(outliers.x, outliers.y, color='#202020', s=25)
plt.scatter(clustered.x, clustered.y, c=clustered.labels, s=25, cmap='hsv_r'
)
some previous answers have suggested;
textdata_with_label_113 = textData[clusterer.labels_ == 113]
However, this returns the embedded value oppoesd to the text value.

With more time on the problem I realised that the embeddings are in the same sequence as the original DF.
therefore you can work back quite easily.
lbls=[]
#seperate the clustered labels into seperate lists (0,1,2,3)
for x in range(len(clustered.labels)):
lbls.append(clustered[clustered.labels == x])
df_desc=[]
# extract the rows from the data frame using the lbls list and use column 6 only in my case
for x in range(len(lbls)):
df_desc.append(df.iloc[lbls[x].index,5])
for i in range(4):
txt = "Cluster {number}"
print(txt.format(number = i))
print(df_desc[i])

Related

K-mer words in R

I am still new to R programming and I just have no idea how to write this same code below from python to R.
human_data is dataframe from CSV file. the word includes sequence of letters. Basically, I want to convert my 'word' column sequence of string into all possible k-mer words of length 6.
def getKmers(sequence, size=6):
return [sequence[x:x+size] for x in range(len(sequence) - size + 1)]
human_data['words'] = human_data.apply(lambda x: getKmers(x['sequence']), axis=1)
You could use the library quanteda too, in order to compute the k-mers (k-grams), the following code shows an example:
library(quanteda)
k = 6 # 6-mers
human_data = data.frame(sequence=c('abcdefghijkl', 'xxxxyyxxyzz'))
human_data$words <- apply(human_data, 1,
function(x) char_ngrams(unlist(tokens(x['sequence'],
'character')), n=k, concatenator = ''))
human_data
# sequence words
#1 abcdefghijkl abcdef, bcdefg, cdefgh, defghi, efghij, fghijk, ghijkl
#2 xxxxyyxxyzz xxxxyy, xxxyyx, xxyyxx, xyyxxy, yyxxyz, yxxyzz
I hope this helps, using R basic commands:
df = data.frame(words=c('asfdklajsjahk', 'dkajsadjkfggfh', 'kfjlhdaDDDhlw'))
getKmers = function(sequence, size=6) {
kmers = c()
for (x in 1:(nchar(sequence) - size + 1)) {
kmers = c(kmers, substr(sequence, x, x+size-1))
}
return(kmers)
}
sapply(df$words, getKmers)

Creating a function to loop columns through an equation in R

Solution (thanks #Peter_Evan!) in case anyone coming across this question has a similar issue
(Original question is below)
## get all slopes (lm coefficients) first
# list of subfields of interest to loop through
sf <- c("left_presubiculum", "right_presubiculum",
"left_subiculum", "right_subiculum", "left_CA1", "right_CA1",
"left_CA3", "right_CA3", "left_CA4", "right_CA4", "left_GC-ML-DG",
"right_GC-ML-DG")
# dependent variables are sf, independent variable common to all models in the inner lm() call is ICV
# applies the lm(subfield ~ ICV, dataset = DF) to all subfields of interest (sf) specified previously
lm.results <- lapply(sf, function(dv) {
temp.lm <- lm(get(dv) ~ ICV, data = DF)
coef(temp.lm)
})
# returns a list, where each element is a vector of coefficients
# do.call(rbind, ) will paste them together
lm.coef <- data.frame(sf = sf,
do.call(rbind, lm.results))
# tidy up name of intercept variable
names(lm.coef)[2] <- "intercept"
lm.coef
## set up all components for the equation
# matrix to store output
out <- matrix(ncol = length(sf), nrow = NROW(DF))
# name the rows after each subject
row.names(out) <- DF$Subject
# name the columns after each subfield
colnames(out) <- sf
# nested for loop that goes by subject (j) and subfield (i)
for(j in DF$Subject){
for (i in sf) {
slope <- lm.coef[lm.coef$sf == i, "ICV"]
out[j,i] <- as.numeric( DF[DF$Subject == j, i] - (slope * (DF[DF$Subject == j, "ICV"] - mean(DF$ICV))) )
}
}
# check output
out
===============
Original Question:
I have a dataframe (DF) with 13 columns (12 different brain subfields, and one column containing total intracranial volume(ICV)) and 50 rows (each a different participant). I'm trying to automate an equation being looped over every column for each participant.
The data:
structure(list(Subject = c("sub01", "sub02", "sub03", "sub04",
"sub05", "sub06", "sub07", "sub08", "sub09", "sub10", "sub11",
"sub12", "sub13", "sub14", "sub15", "sub16", "sub17", "sub18",
"sub19", "sub20"), ICV = c(1.50813, 1.3964237, 1.6703585, 1.4641886,
1.6351018, 1.5524641, 1.4445532, 1.6384505, 1.6152434, 1.5278011,
1.4788126, 1.4373356, 1.4109637, 1.3634952, 1.3853583, 1.4855268,
1.6082085, 1.5644998, 1.5617522, 1.4304141), left_subiculum = c(411.225013,
456.168033, 492.968477, 466.030173, 533.95505, 476.465524, 448.278213,
476.45566, 422.617374, 498.995121, 450.773906, 461.989663, 549.805272,
452.619547, 457.545623, 451.988333, 475.885847, 490.127968, 470.686415,
494.06548), left_CA1 = c(666.893596, 700.982955, 646.21927, 580.864234,
721.170599, 737.413139, 737.683665, 597.392434, 594.343911, 712.781376,
733.157168, 699.820162, 701.640861, 690.942843, 606.259484, 731.198846,
567.70879, 648.887718, 726.219904, 712.367433), left_presubiculum = c(325.779458,
391.252815, 352.765098, 342.67797, 390.885737, 312.857458, 326.916867,
350.657957, 325.152464, 320.718835, 273.406949, 305.623938, 371.079722,
315.058313, 311.376271, 319.56678, 348.343569, 349.102678, 322.39908,
306.966008), `left_GC-ML-DG` = c(327.037756, 305.63224, 328.945065,
238.920358, 319.494513, 305.153183, 311.347404, 259.259723, 295.369164,
312.022281, 324.200989, 314.636501, 306.550385, 311.399107, 295.108592,
356.197094, 251.098248, 294.76349, 317.308576, 301.800253), left_CA3 = c(275.17038,
220.862237, 232.542718, 170.088695, 234.707172, 210.803287, 246.861975,
171.90896, 220.83478, 236.600832, 246.842024, 239.677362, 186.599097,
224.362411, 229.9142, 293.684776, 172.179779, 202.18936, 232.5666,
221.896625), left_CA4 = c(277.614028, 264.575987, 286.605092,
206.378619, 281.781858, 258.517989, 269.354864, 226.269982, 256.384436,
271.393257, 277.928824, 265.051581, 262.307377, 266.924683, 263.038686,
306.133918, 226.364556, 262.42823, 264.862956, 255.673948), right_subiculum = c(468.762375,
445.35738, 446.536018, 456.73484, 521.041823, 482.768261, 487.2911,
456.39996, 445.392976, 476.146498, 451.775611, 432.740085, 518.170065,
487.642399, 405.564237, 487.188989, 467.854363, 479.268714, 473.212833,
472.325916), right_CA1 = c(712.973011, 717.815214, 663.637105,
649.614586, 711.844375, 779.212704, 862.784416, 648.925038, 648.180611,
760.761704, 805.943016, 717.486756, 801.853608, 722.213109, 621.676321,
791.672796, 605.35667, 637.981476, 719.805053, 722.348921), right_presubiculum = c(327.285242,
364.937865, 288.322641, 348.30058, 341.309111, 279.429847, 333.096795,
342.184296, 364.245998, 350.707173, 280.389853, 276.423658, 339.439377,
321.534798, 302.164685, 328.365751, 341.660085, 305.366589, 320.04127,
303.83284), `right_GC-ML-DG` = c(362.391907, 316.853532, 342.93274,
282.550769, 339.792696, 357.867386, 342.512721, 277.797528, 309.585721,
343.770416, 333.524912, 302.505077, 309.063135, 291.29361, 302.510461,
378.682679, 255.061044, 302.545288, 313.93902, 297.167161), right_CA3 = c(307.007404,
243.839349, 269.063801, 211.336979, 249.283479, 276.092623, 268.183349,
202.947849, 214.642782, 247.844657, 291.206598, 235.864996, 222.285729,
201.427853, 237.654913, 321.338801, 199.035108, 243.204203, 236.305659,
213.386702), right_CA4 = c(312.164065, 272.905586, 297.99392,
240.765062, 289.98697, 306.459566, 284.533068, 245.965817, 264.750571,
296.149675, 290.66935, 264.821461, 264.920869, 246.267976, 266.07378,
314.205819, 229.738951, 274.152503, 256.414608, 249.162404)), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
The equation:
adjustedBrain(participant1) = rawBrain(participant1) - slope*[ICV(participant1) - (mean of all ICV measures included in the calculation of the slope)]
The code (which is not working and I was hoping for some pointers):
adjusted_Brain <- function(DF, subject) {
subfields <- colnames(select(DF, "left_presubiculum", "right_presubiculum",
"left_subiculum", "right_subiculum", "left_CA1", "right_CA1",
"left_CA3", "right_CA3", "left_CA4", "right_CA4", "left_GC-ML-DG",
"right_GC-ML-DG"))
out <- matrix(ncol = length(subfields), nrow = NROW(DF))
for (i in seq_along(subfields)) {
DF[i] = DF[DF$Subject == "subject", "i"] -
slope * (DF[DF$Subject == "subject", "ICV"] -
mean(DF$ICV))
}
}
Getting this error:
Error: Can't subset columns that don't exist.
x Column `i` doesn't exist.
A few notes:
The slopes for each subject for each subfield will be different (and will come from a regression) -> is there a way to specify that in the function so the slope (coefficient from the appropriate regression equation) gets called in?
I have my nrow set to the number of participants right now in the output because I'd like to have this run through EVERY subject across EVERY subfield and spit out a matrix with all the adjusted brain volumes... But that seems very complicated and so for now I will just settle for running each participant separately.
Any help is greatly appreciated!
As others have noted in the comments, there are quite a few syntax issues that prevent your code from running, as well as a few unstated requirements. That aside, I think there is enough to recommend a few improvements that you can hopefully build on. Here are the top line changes:
You likely don't need this to be a function, but rather a nested for loop (if you want to do this with base R). As written, the code isn't flexible enough to merit a function. If you intend to apply this many times across different datasets, a function might make sense. However, it will require a much larger rewrite.
Assuming you are fitting a simple regression via lm, then you can pull out the coefficient of interest via the $ operator and indexing (see below). Some thought will need to go into how to handle different models in the loop. Here, we assume you only need one coefficient from one model.
There are a few areas where the syntax is incorrect and a review of sub setting in base R would be helpful. Others have pointed out in the comments were some of these are.
Here is one approach were we loop through each subject (j) through each feature or subfield (i) and store them in a matrix (out). This is just an approach and will almost certainly need tweaking on your end!
#NOTE: the dataset your provided is saved as x in this example.
#fit a linear model - here we assume there is only one coef. of interest, but you may need to alter
# depending on how the slope changes in each calculation
reg <- lm(ICV ~ right_CA3, x)
# view the coeff.
reg$coefficients
# pull out the slope by getting the coeff. of interest (via index) from the reg object
slope <- reg$coefficients[[1]]
# list of features/subfeilds to loop through
sf <- c("left_presubiculum", "right_presubiculum",
"left_subiculum", "right_subiculum", "left_CA1", "right_CA1",
"left_CA3", "right_CA3", "left_CA4", "right_CA4", "left_GC-ML-DG",
"right_GC-ML-DG")
# matrix to store output
out <- matrix(ncol = length(sf), nrow = NROW(x))
#name the rows after each subject
row.names(out) <- x$Subject
#name the columns after each sub feild
colnames(out) <- sf
# nested for loop that goes by subject (j) and features/subfeilds (i)
for(j in x$Subject){
for (i in sf) {
out[j,i] <- as.numeric( x[x$Subject == j, i] - (slope * (x[x$Subject == j, "ICV"] - mean(x$ICV))) )
}
}
# check output
out

how to interpolate data within groups in R using seqtime?

I am trying to use seqtime (https://github.com/hallucigenia-sparsa/seqtime) to analyze time-serie microbiome data, as follow:
meta = data.table::data.table(day=rep(c(15:27),each=3), condition =c("a","b","c"))
meta<- meta[order(meta$day, meta$condition),]
meta.ts<-as.data.frame(t(meta))
otu=matrix(1:390, ncol = 39)
oturar<-rarefyFilter(otu, min=0)
rarotu<-oturar$rar
time<-meta.ts[1,]
interp.otu<-interpolate(rarotu, time.vector = time,
method = "stineman", groups = meta$condition)
the interpolation returns the following error:
[1] "Processing group a"
[1] "Number of members 13"
intervals
0
12
[1] "Selected interval: 1"
[1] "Length of time series: 13"
[1] "Length of time series after interpolation: 1"
Error in stinepack::stinterp(time.vector, as.numeric(x[i, ]), xout = xout, :
The values of x must strictly increasing
I tried to change method to "hyman", but it returns the error below:
Error in interpolateSub(x = x, time.vector = time.vector, method = method) :
Time points must be provided in chronological order.
I am using R version 3.6.1 and I am a bit new to R.
Please can anyone tell me what I am doing wrong/ how to go around these errors?
Many thanks!
I used quite some time stumbling around trying to figure this out. It all comes down to the data structure of meta and the resulting time variable used as input for the time.vector parameter.
When meta.ts is being converted to a data frame, all strings are automatically converted to factors - this includes day.
To adjust, you can edit your code to the following:
library(seqtime)
meta <- data.table::data.table(day=rep(c(15:27),each=3), condition =c("a","b","c"))
meta <- meta[order(meta$day, meta$condition),]
meta.ts <- as.data.frame(t(meta), stringsAsFactors = FALSE) # Set stringsAsFactors = FALSE
otu <- matrix(1:390, ncol = 39)
oturar <- rarefyFilter(otu, min=0)
rarotu <- oturar$rar
time <- as.integer(meta.ts[1,]) # Now 'day' is character, so convert to integer
interp.otu <- interpolate(rarotu, time.vector = time,
method = "stineman", groups = meta$condition)
As a bonus, read this blogpost for information on the stringsAsFactors parameter. Strings automatically being converted to Factors is a common bewilderment.

Microarray Limma package, in topTable function don't assign ID for probsets column

I tried a tutorial by Daniel Swan ,it works perfectly well. But I'm facing a problem in topTable function of limma package.
The "topTable" function create a "probeset list" but this probset list have not "ID" header (other columns name is their sample name, but Probe list column have not name (ID)).
At the result, when I am runing:
gene.symbols <- getSYMBOL(probeset.list$ID, "hgu133plus2")
I'm getting the following error
Error in .select(x, keys, columns, keytype = extraArgs[["kt"]], jointype = jointype):
'keys' must be a character vector
topTable is:
logFC AveExpr t P.Value adj.P.Val B
204779_s_at 7.367790 4.171707 72.77347 3.284937e-15 8.969850e-11 20.25762
207016_s_at 6.936667 4.027733 57.39252 3.694641e-14 5.044293e-10 19.44987
209631_s_at 5.192949 4.003992 51.24892 1.170273e-13 1.065182e-09 18.96660
my expression Set achieved by simpleaffy (gcrma) package.
I'm runing R 3.0.2 under windows 7 with latest bioconductor packages, simpleaffy_2.38.0 , limma_3.18.13 and anotation files: hgu133plus2.db_2.10.1 ,hgu133plus2probe_2.13.0, hgu133plus2cdf_2.13.0
I would be very thankful, if somebody could help me.
The IDs are not stored as an ID column, but as the rownames of the table. Change the line to:
gene.symbols <- getSYMBOL(rownames(probeset.list), "hgu133plus2")
If you want there to be an ID column instead of using row names, you can assign one with:
probeset.list$ID = rownames(probeset.list)
According to the documentation of the toptable function, the ID column will exist if and only if there are duplicated gene names:
If ‘fit’ had unique rownames, then the row.names of the above
data.frame are the same in sorted order. Otherwise, the row.names
of the data.frame indicate the row number in ‘fit’. If ‘fit’ had
duplicated row names, then these are preserved in the ‘ID’ column
of the data.frame, or in ‘ID0’ if ‘genelist’ already contained an
‘ID’ column.
In the other examples you've seen ID used, there must have been duplicate gene names in the input. This makes sense because R typically doesn't like having duplicated rownames (but has no problem having duplicate IDs in a column).
Hope my piece of working codes can make your question clear:
library(limma) # загружаем нужную библиотека
library(siggenes)
library(cluster)
library(stats)
data <- read.table("AneurismDataAllProbesGenesisLog2NormalizedExperAndGenes.tab", sep = "\t", header = TRUE) # read from file
q = as.matrix(data) # данные в матрицу
b = as.matrix(cbind(data[, 2:10], data[, 11:14])) # cмежные колонки данных
m = normalizeQuantiles(b, ties=TRUE)
f = data.frame(condition = c(0,0,0,0,0,0,0,0,0,1,1,1,1)) # дизайн
fit = lmFit(m, f) # линейная модель
e = eBayes(fit) # тест Байеса
volcanoplot(e, coef=1, highlight=5, names=data$GeneName, xlab="Log Fold Change", ylab="Log Odds", pch=19, cex=0.67, col = "dark blue") # график-вулкан
z = rownames(m) = data[, 1]
hc <- hclust(dist(m), "ave") # кластерграмма
plot(hc)
plot(hc, hang = -1)
print(e$coefficients) # output eBayes coefficients
print(e$p.value) # get out the P values
toptable(e) # select 10 most differentialy expressed genes, the disadvantage that it outputs only the gene row number and not the name
printresult <-toptable(e) # assign the result to a variable
write.csv(printresult, file = "eBayesTableAneurism", row.names = TRUE) # write to the file in the current folder
volcanoplot(e, coef=1, highlight=10, names=data[,1], xlab="Log Fold Change", ylab="Log Odds", pch=19, cex=0.67, col = "red") # график-вулкан c именами
volcanoplot(e, coef=1, highlight=5, names=data[,1], xlab="Log Fold Change", ylab="Log Odds", pch=19, cex=0.67, col = "blue") # график-вулкан с именами (Volcano with gene names)

R WGCNA Cytoscape hub genes

I have the following problem with
WGCNA - http://labs.genetics.ucla.edu/horvath/htdocs/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/
Working on Section 1.6, Export of networks to external software (Cytoscape)
I'm currently trying to perform WGCNA on a set of genes and I'm having trouble getting the top x hub genes for each module. I am trying to export a network to Cytoscape and used the same method for getting the top x hub genes as outlined for exporting to VisANT.
# Select modules (only interested in one for now)
modules = c("greenyellow")
# Select module probes
probes = names(datExpr)
inModule = is.finite(match(bwModuleColors, modules))
modProbes = probes[inModule]
modGenes = annot$gene_symbols[match(modProbes, annot$geneID)]
# Select the corresponding Topological Overlap
modTOM = TOM[inModule, inModule]
dimnames(modTOM) = list(modProbes, modProbes)
# Restrict the network to the top 30 genes
nTop = 30
IMConn = softConnectivity(datExpr[, modProbes]
top = (order(-IMConn) <= nTop)
# Export the network into a fomat that Cytoscape can read
cyt = exportNetworkToCytoscape(modTOM[top, top],
edgeFile = paste("CytoscapeInput-edges-", paste(modules, collapse="-"), ".txt", sep = ""),
nodeFile = paste("CytoscapeInput-nodes-", paste(modules, collapse="-"), ".txt", sep = ""),
weight = TRUE,
threshold = 0.02,
nodeNames = modProbes,
altNodeNames = modGenes,
nodeAttr = bwModuleColors[inModule])
I've written a short loop to count the number of connections to each gene and it works as expected, but the xth gene consistently has zero connections. Let's say that x is 30. If I increase the cutoff to 31 hub genes, the 30th gene now shows connections to the others in the network, but the 31st gene shows nothing. In addition, this change increases AND decreases some of the number of connections to other genes in the network. This really bothers me, because connections should only be added, since the network is getting bigger by one gene, and the changes should be accounted for by the 30th gene, but this is not the case for the output.
# Split the cytoscape file into two parts: edge and node
node <- cyt$nodeData
edge <- cyt$edgeData
# The limit covers all of the connections in the edge file by determining the length of the column ‘fromNode’
limit <- length(edge$fromNode)
# Create an empty list to store the counts for each gene
counts = list()
# Loop for the genes going from 1 to the number of genes specified for the network, ‘nTop’
for (i in 1:nTop) {
# Reset the count for each new gene and specify the names of the gene of interest and the matching genes
name = node$nodeName[[i]]
count = 0
# Nested loop that searches for matches to the gene in question in both the ‘fromNode’ and ‘toNode’columns, and adds one to the count for each match.
for (j in 1:limit) {
matchName1 = edge$fromNode[[j]]
matchName2 = edge$toNode[[j]]
if (name == matchName1 || name == matchName2)
{count = count + 1}
}
# Create a string for the attribute in the correct format
attribute <- paste(name, "=", count)
# Adds the count to the list
counts <- c(counts, attribute)
}
# End of loop
The loop seems to be working as expected, so I'm thinking that the problem is with the network construction. I'm currently referring back to what I know about linear algebra, matrices and topology to try to see if the problem is the way they're being sorted or something like that, but it might just be the way that the exportNetworkToCytoscape() function works.
modules = "brown";
probes = rownames(datExpr_human) ======> data genes in row and samples in column.
inModule = is.finite(match(modules_human,modules))
modTOM = dissTOM_Human[inModule, inModule];
modProbes = probes[inModule];
dimnames(modTOM) = list(modProbes, modProbes)
nTop = 30;
datExpr = t(datExpr_human)
IMConn = softConnectivity(datExpr[, modProbes]);
top = (rank(-IMConn) <= nTop)
cyt = exportNetworkToCytoscape(modTOM[top, top],
edgeFile = paste("CytoscapeInput-edges-", paste(modules, collapse="-"), ".txt", sep=""),
nodeFile = paste("CytoscapeInput-nodes-", paste(modules, collapse="-"), ".txt", sep=""),
weighted = TRUE)

Resources