heatmap error when implementing as.matrix - r

When I use the heatmap function to make a heatmap of my dataset, I get an error. I tried:
df1$family <- substr(as.character(df1$gene_id), 1, nchar(as.character(df1$gene_id))-2)
df01<-df1$family
df01m<-as.matrix(df01)
heatmap(df01m)
I get this error:
Error in heatmap(df01m): 'x' must be a numeric matrix
Traceback:
1. heatmap(df01m)
2. stop("'x' must be a numeric matrix")
The dataset is big, so I cut some of it:
structure(list(gene_id = structure(6:11, .Label = c("__alignment_not_unique",
"__ambiguous", "__no_feature", "__not_aligned", "__too_low_aQual",
"ENSG00000000005", "ENSG00000000419", "ENSG00000000457", "ENSG00000000460",
"ENSG00000000938", "ENSG00000000971", "ENSG00000001036", "ENSG00000001084",
"ENSG00000001167", "ENSG00000001460", "ENSG00000001461", "ENSG00000001497",
"ENSG00000001561", "ENSG00000001617", "ENSG00000001626", "ENSG00000001629",
"ENSG00000001630", "ENSG00000001631", "ENSG00000002016", "ENSG00000002079",
"ENSG00000002330", "ENSG00000002549", "ENSG00000002586", "ENSG00000002587",
"ENSG00000002726", "ENSG00000002745", "ENSG00000002746", "ENSG00000002822",
"ENSG00000002834", "ENSG00000002919", "ENSG00000002933", "ENSG00000003056",
"ENSG00000003096", "ENSG00000003137", "ENSG00000003147", "ENSG00000003249",
"ENSG00000003393", "ENSG00000003400", "ENSG00000003402", "ENSG00000003436",
"ENSG00000003509", "ENSG00000003756", "ENSG00000003987", "ENSG00000003989",
"ENSG00000004059", "ENSG00000004139", "ENSG00000004142", "ENSG00000004399",
"ENSG00000285989", "ENSG00000285990", "ENSG00000285991", "ENSG00000285992",
"ENSG00000285993", "ENSG00000285994"), class = "factor"), expr = c(6L,
754L, 447L, 426L, 5L, 1L)), row.names = c(NA, 6L), class = "data.frame")
head of the data set:
gene_id expr
<fct> <int>
1 ENSG00000000005 6
2 ENSG00000000419 754
3 ENSG00000000457 447
4 ENSG00000000460 426
5 ENSG00000000938 5
6 ENSG00000000971 1

The error says that heatmap() needs a numeric matrix. substr() returns a character vector, so as.matrix(df01) is a character matrix. We can convert the substring vector to numeric (any value that is not a valid number becomes NA):
df01m <- as.matrix(as.numeric(df01))
Another issue is that heatmap requires a matrix with at least 2 rows and 2 columns. Here as.matrix converts the vector to a single-column matrix, so heatmap will still not work.

Related

Normalizing values depending on group in R [duplicate]

This question already has answers here:
Normalize by Group
(2 answers)
Closed 2 years ago.
I have this dataset:
> head(meltCalcium)
Time Cell Intensity
1 1 IntDen1 306852.5
2 2 IntDen1 302892.2
3 3 IntDen1 298258.6
4 4 IntDen1 300769.9
5 5 IntDen1 301971.8
6 6 IntDen1 302585.6
> tail(meltCalcium)
Time Cell Intensity
32531 659 IntDen49 47788.16
32532 660 IntDen49 47560.32
32533 661 IntDen49 47738.24
32534 662 IntDen49 48968.96
32535 663 IntDen49 48796.16
32536 664 IntDen49 48156.80
I have 49 Cells and the time reaches 664 for each one of them. In this case time is not important, as I'd like to get the normalized Intensity for each cell (so (Intensity - min)/(max - min)), and possibly adding it as a new column to the dataframe.
I tried
> meltCalcium$normalized <- with(meltCalcium, (Intensity - min(Intensity))/diff(range(Intensity)))
but in this way the max and the min are calculated using the Intensity over all Cells. How can I do it for each cell separately?
Thanks!
Apply the formula by group :
library(dplyr)

# Min-max normalise Intensity separately within each Cell. ungroup() drops
# the grouping afterwards so later verbs on `result` are not silently
# applied per Cell.
result <- meltCalcium %>%
  group_by(Cell) %>%
  mutate(
    normalized = (Intensity - min(Intensity)) / diff(range(Intensity))
  ) %>%
  ungroup()
Base R solution:
# Min-max rescale a numeric vector: (x - min) / (max - min).
# Missing values are ignored when finding the bounds and stay missing in the
# result, which has the same length as `num_vec`.
normalise_vec_min_max <- function(num_vec) {
  bounds <- range(num_vec, na.rm = TRUE)
  lower <- bounds[1L]
  upper <- bounds[2L]
  (num_vec - lower) / (upper - lower)
}
with(meltCalcium, ave(Intensity, Cell, FUN = normalise_vec_min_max))
Data:
meltCalcium <- structure(list(Time = c(1L, 2L, 3L, 4L, 5L, 6L, 659L, 660L, 661L,
662L, 663L, 664L), Cell = c("IntDen1", "IntDen1", "IntDen1",
"IntDen1", "IntDen1", "IntDen1", "IntDen49", "IntDen49", "IntDen49",
"IntDen49", "IntDen49", "IntDen49"), Intensity = c(306852.5,
302892.2, 298258.6, 300769.9, 301971.8, 302585.6, 47788.16, 47560.32,
47738.24, 48968.96, 48796.16, 48156.8)), row.names = c(NA, -12L
), class = "data.frame")

How to make a list of vectors from a 2 columns data frame in R?

I'm having a hard time trying to transform my data frame into a list of vectors, where each element's name is the string in the first column of the data frame and the vector itself is the vector already stored in the second column.
I have this data frame:
> head(df)
Intron.ID
1 AT1G79930.2
2 ATCG00720.1
3 AT1G02080.2
4 AT4G32551.2
5 AT5G66190.1
6 AT1G51720.1
Sequence.s.
1 ['GAGGTGCTTGCAAATCGTTCACATCACTGTACTGCACATCAACAGAGAAT']
2 ['GCTTCTTTGTATTTTATGTTTTTAGTCATTATAGCTTTTTTTTTGAATAA', 'TGTTTGAGCTGTACGAGATGAAATTCTCATATACAGTTCTTGGAGGGGGG']
3 ['CTCACCCGGAGTTAGTCACTGTTATTGAACAAGCACTTTCAAGGATATCA']
4 ['AAGTGGTGGTATGTCTCCACAGGTTCAAACTCGAAATCAGCAACTTCCTG']
5 ['AAGGGTTCTTAGGTTTGAATTTGTTGACAACAATCCCTTCTTCCTGTTTC']
6 ['ATTTGGCTTCTCACATAACACTGAAGCTGTGTGACTTGTGTACAATTTTG', 'CTGAGTTAATCTAATAAGCAAGATACATTTTACTTTCGTTTTCCTCTTCC']
And I need this output:
$AT1G79930.2
[1] "GAGGTGCTTGCAAATCGTTCACATCACTGTACTGCACATCAACAGAGAAT"
$ATCG00720.1
[1] "GCTTCTTTGTATTTTATGTTTTTAGTCATTATAGCTTTTTTTTTGAATAA" "TGTTTGAGCTGTACGAGATGAAATTCTCATATACAGTTCTTGGAGGGGGG"
$AT1G02080.2
[1] "CTCACCCGGAGTTAGTCACTGTTATTGAACAAGCACTTTCAAGGATATCA"
$AT4G32551.2
[1] "AAGTGGTGGTATGTCTCCACAGGTTCAAACTCGAAATCAGCAACTTCCTG"
$AT5G66190.1
[1] "AAGGGTTCTTAGGTTTGAATTTGTTGACAACAATCCCTTCTTCCTGTTTC"
$AT1G51720.1
[1] "ATTTGGCTTCTCACATAACACTGAAGCTGTGTGACTTGTGTACAATTTTG" "CTGAGTTAATCTAATAAGCAAGATACATTTTACTTTCGTTTTCCTCTTCC"
The closest I got to that result was with the following command:
> df2 <- split(df, df[1])
> head(df2)
$`AT1G01760.2 `
Intron.ID Sequence.s.
11 AT1G01760.2 ['ACCGGTTGTTCCAAGAATAACTTCGTGTAAGCCAGAATAGTTCCAACACA']
$`AT1G02080.2 `
Intron.ID Sequence.s.
3 AT1G02080.2 ['CTCACCCGGAGTTAGTCACTGTTATTGAACAAGCACTTTCAAGGATATCA']
$`AT1G04430.2 `
Intron.ID Sequence.s.
9 AT1G04430.2 ['CATTATGAACGGCATTGTCCTCCTCCCGAAAGACGGTTTAATTGTTTGAT']
$`AT1G06150.1 `
Intron.ID Sequence.s.
45 AT1G06150.1 ['TGCTAGTGGATCCGTAAGTGCCAAAAATAAATGCCTGATATGAGTCACCA']
$`AT1G17680.3 `
Intron.ID Sequence.s.
48 AT1G17680.3 ['GCAAGCACCAGCTTTCGATATAGCATACTATTACCTTTCACGTGTTTCTG']
$`AT1G18470.2 `
Intron.ID Sequence.s.
81 AT1G18470.2 ['TTCCTTCGTCAATTGACCACCAACCTAATAGCCTGGAACCATGGTGCAAG']
It got it all wrong: the sequences are not assigned to the right names and some sequences are missing. It is not a good solution...
Extra information as requested (I added the 2 '...'):
> dput(head(df))
structure(list(Intron.ID = structure(c(15L, 80L, 2L, 58L, 79L,
9L), .Label = c("AT1G01760.2 ", "AT1G02080.2 ", "AT1G04430.2 ",
"AT1G06150.1 ", "AT1G17680.3 " ...), class = "factor"), Sequence.s. = structure(c(49L,
59L, 39L, 3L, 2L, 15L), .Label = c(" ['AAACACAAGGGTGGGGTTGACTCTCAAACTCACAAAAAGTTACATTTTCT']",
" ['AAGGGTTCTTAGGTTTGAATTTGTTGACAACAATCCCTTCTTCCTGTTTC']", " ['AAGTGGTGGTATGTCTCCACAGGTTCAAACTCGAAATCAGCAACTTCCTG']",
" ['AATCCATAAAGAAAATGGAGGAGAACATTCAGAATCTGGAAGGTAAGAAC', 'GATTTATGCTTTGGCAACAAAGAGTAGTCATATTCCATACAGGAACTCAA']",
" ['AATTGATCCAGATTGTAGATTAATTGGACTCCATCTGTATGACGGCTTGT']" ...
), class = "factor")), row.names = c(NA, 6L), class = "data.frame")
So, how can I do that transformation without passing the header 'Intron.ID' and 'Sequence.s.' to the vectors and just keep the sequences inside the vector (in the right order and assignment), not including the Intron.ID?
Any help will be very much appreciated!
Thank you all in advance.
Cordially,
Fernanda Costa

R data.table get maximum value per row for multiple columns

I've got a data.table in R which looks like that one:
dat <- structure(list(de = c(1470L, 8511L, 3527L, 2846L, 2652L, 831L
), fr = c(14L, 81L, 36L, 16L, 30L, 6L), it = c(9L, 514L, 73L,
37L, 91L, 2L), ro = c(1L, 14L, 11L, 1L, 9L, 0L)), .Names = c("de",
"fr", "it", "ro"), class = c("data.table", "data.frame"), row.names = c(NA,
-6L))
I now want to create a new data.table (having exactly the same columns) but holding only the maximum value per row. The values in the other columns should simply be NA.
The data.table could have any number of columns (the data.table above is just an example).
The desired output table would look like this:
de fr it ro
1: 1470 NA NA NA
2: 8511 NA NA NA
3: 3527 NA NA NA
4: 2846 NA NA NA
5: 2652 NA NA NA
6: 831 NA NA NA
There are several issues with what the OP is attempting here: (1) this really looks like a case where data should be kept in a matrix rather than a data.frame or data.table; (2) there's no reason to want this sort of output that I can think of; and (3) doing any standard operations with the output will be a hassle.
With that said...
# Three ways to keep only each row's maximum and set every other cell to NA.
# Note that max.col() breaks ties at random by default; pass
# ties.method = "first" if a deterministic choice is needed.

# (1) linear indexing: mark every cell except each row's maximum as NA
dat2 <- dat
is.na(dat2)[-(seq_len(nrow(dat)) + (max.col(dat) - 1) * nrow(dat))] <- TRUE

# (2) or, as #PierreLafortune suggested
dat2 <- dat
is.na(dat2)[col(dat) != max.col(dat)] <- TRUE

# (3) or using the data.table package: start from an all-NA table with the
# same columns, then copy each row's maximum into place
dat2 <- dat[rep(NA_integer_, nrow(dat)), ]
mc <- max.col(dat)
for (i in seq_along(mc)) {
  # dat[[mc[i]]][i] reads the cell value itself; on a data.table,
  # dat[i, mc[i]] evaluates `mc[i]` as a j-expression and returns the
  # column index instead of the value.
  set(dat2, i = i, j = mc[i], value = dat[[mc[i]]][i])
}
It's not clear to me whether you mean that you want to use the data.table package, or if you are satisfied with making a data.frame using only base functions. It is certainly possible to do the latter.
Here is one solution, which uses only max() and which.max() and relies on the fact that an empty data.frame will fill in all of the remaining cells with NA to achieve a rectangular structure.
# For every column of dat, write that column's maximum into maxdat at the row
# where it occurs (which.max() gives the first such row); cells that are never
# assigned are left as NA.
# NOTE(review): which.max()/max() here run over each column, not over each
# row -- confirm this matches the per-row maximum that is wanted.
maxdat <- data.frame()
for (col in names(dat)) {
maxdat[which.max(dat[,col]), col] <- max(dat[,col])
}

Frequency of items in a list in R

I have a very large csv file. I want to calculate the frequency of the items in the second column in order to graph histogram. An example of my data:
0010,10.1.1.16
0011,10.2.2.10
0012,192.168.2.61
0013,192.168.173.19
0014,10.2.2.10
0015,10.2.2.10
0016,192.168.2.61
I have used the below:
inFile <- read.csv("file.csv")
summary(inFile)
hist(inFile$secondCol)
output of summary:
X0010 X10.1.1.16
Min. :11.00 10.2.2.10 :3
1st Qu.:12.25 192.168.173.19:1
Median :13.50 192.168.2.61 :2
Mean :13.50
3rd Qu.:14.75
Max. :16.00
Because the file is very large, I'm not getting the right histogram. Any suggestions?
Just use table.
DF <- structure(list(V1 = 10:16, V2 = structure(c(1L, 2L, 4L, 3L, 2L,
2L, 4L), .Label = c("10.1.1.16", "10.2.2.10",
"192.168.173.19", "192.168.2.61"), class = "factor")),
.Names = c("V1", "V2"), class = "data.frame",
row.names = c(NA, -7L))
table(DF$V2)
# 10.1.1.16 10.2.2.10 192.168.173.19 192.168.2.61
# 1 3 1 2
If you want a data.frame out of this, you can use as.data.frame:
as.data.frame(table(DF$V2))
# Var1 Freq
# 1 10.1.1.16 1
# 2 10.2.2.10 3
# 3 192.168.173.19 1
# 4 192.168.2.61 2
Since you say you want a histogram, this can be done directly using ggplot2 without having to get the counts first as follows:
library(ggplot2)
# V2 is a factor, so count its levels with geom_bar(); geom_histogram() bins
# a continuous variable and rejects a discrete x in current ggplot2.
ggplot(data = DF, aes(x = V2)) +
  geom_bar()
We could also have called as.numeric() on the column.
typeof(data$hourofcrime)
# gives me a list
#> typeof(data$hourofcrime)
#[1] "list"
hour_crime_rate <- as.numeric(data$hourofcrime)
hist(hour_crime_rate)

Extracting values from R table within grouped values

I have the following table ordered group by first, second and name.
myData <- structure(list(first = c(120L, 120L, 126L, 126L, 126L, 132L, 132L), second = c(1.33, 1.33, 0.36, 0.37, 0.34, 0.46, 0.53),
Name = structure(c(5L, 5L, 3L, 3L, 4L, 1L, 2L), .Label = c("Benzene",
"Ethene._trichloro-", "Heptene", "Methylamine", "Pentanone"
), class = "factor"), Area = c(699468L, 153744L, 32913L,
4948619L, 83528L, 536339L, 105598L), Sample = structure(c(3L,
2L, 3L, 3L, 3L, 1L, 1L), .Label = c("PO1:1", "PO2:1", "PO4:1"
), class = "factor")), .Names = c("first", "second", "Name",
"Area", "Sample"), class = "data.frame", row.names = c(NA, -7L))
Within each group I want to extract the area that corresponds to the specific sample. Several groups don't have areas from the samples, so if the sample isn't detected it should return "NA". Ideally, the final output should be a column for each sample.
I have tried the ifelse function to create one column to each sample:
PO1<-ifelse(myData$Sample=="PO1:1",myData$Area, "NA")
However, this doesn't take the group distribution into account. I want to do this, but within each group. Within each group (a group has equal values for the first, second and Name columns): if sample=PO1:1, Area, else NA.
For the first group:
structure(list(first = c(120L, 120L), second = c(1.33, 1.33),
Name = structure(c(1L, 1L), .Label = "Pentanone", class = "factor"),
Area = c(699468L, 153744L), Sample = structure(c(2L, 1L), .Label = c("PO2:1",
"PO4:1"), class = "factor")), .Names = c("first", "second", "Name",
"Area", "Sample"), class = "data.frame", row.names = c(NA, -2L))
The output should be:
structure(list(PO1.1 = NA, PO2.1 = 153744L, PO3.1 = NA, PO4.1 = 699468L), .Names =c("PO1.1", "PO2.1", "PO3.1", "PO4.1"), class = "data.frame", row.names = c(NA, -1L))
Any suggestion?
As in the example in the question, I am assuming Sample is a factor. If this is not the case, consider making it such.
First, let's clean up the column Sample to make it a legal name, or else it might cause errors
# Build a character matrix with one row per group and one column per sample,
# holding the Area value(s) recorded for that sample within the group
# (NA when the sample is absent from the group).
# Uses myData$first, $second, $Name, $Area and the factor myData$Sample.

# Turn the sample labels into syntactically valid names so they are safe to
# use as column names.
levels(myData$Sample) <- make.names(levels(myData$Sample))

## DEFINE THE CUTS ##
# Adjust these as necessary
#--------------------------
max.second <- 3     # max & min range of myData$second
min.second <- 0     #
sprd       <- 0.15  # spread for each group (bins are 2 * sprd wide)
#--------------------------

# we will cut myData$second according to intervals:
# cut(myData$second, intervals)
intervals <- seq(min.second, max.second, sprd * 2)

# Next, let's create a group column to split our data frame by
myData$group <- paste(
  myData$first, cut(myData$second, intervals), myData$Name,
  sep = "-"
)
groups <- split(myData, myData$group)

## I'm assuming not all samples are present in the example.
## Manually adjusting with: samples <- sort(c(samples, "PO3.1"))
samples <- levels(myData$Sample)

# Apply over each group, then apply over each sample.
# vapply() (rather than sapply()) pins the result to one character value per
# sample, so the shape of myOutput does not depend on the data.
myOutput <- t(vapply(groups, function(g) {
  #-------------------------------
  # NOTE: If it's possible that within a group there is more than one Area
  # per Sample, then we have to somehow allow for this. Hence the
  # "paste0(...)", which joins multiple values with " - ".
  #
  # If there is (or should be) only one Area per Sample, then replace the
  # lines below with the two commented-out lines:
  # res <- sapply(samples, function(s) g$Area[g$Sample==s])
  # unlist(ifelse(res==0, NA, res))
  #-------------------------------
  res <- vapply(
    samples,
    function(s) paste0(g$Area[g$Sample == s], collapse = " - "),
    character(1)
  )
  # A sample with no Area in this group comes back as "": report it as NA.
  res[res == ""] <- NA_character_
  res
}, character(length(samples))))

# Cleanup names
## or whichever proper group name
rownames(myOutput) <- paste("Group", seq_len(nrow(myOutput)), sep = "-")

# remove dummy column
myData$group <- NULL
Results
myOutput
PO1.1 PO2.1 PO3.1 PO4.1
Group-1 NA "153744" NA "699468"
Group-2 NA NA NA "32913 - 4948619"
Group-3 NA NA NA "83528"
Group-4 "536339" NA NA NA
Group-5 "105598" NA NA NA
You cannot really expect R to intuit that there is a fourth factor level between PO2 and PO4 , now can you.
> reshape(inp, direction="wide", idvar=c('first','second','Name'), timevar="Sample")
first second Name Area.PO4:1 Area.PO2:1 Area.PO1:1
1 120 1.3 Pentanone 699468 153744 NA
3 126 0.4 Heptene 32913 NA NA
4 126 0.4 Heptene 4948619 NA NA
5 126 0.3 Methylamine 83528 NA NA
6 132 0.5 Benzene NA NA 536339
7 132 0.5 Ethene._trichloro- NA NA 105598

Resources