convert df from factor to numeric - r

I am struggling to convert my dataset into numeric values. The dataset I have looks like this:
customer_id 2012 2013 2014 2015 2016 2017 2018
15251 X N U D S C L
X1 - X7 are marked as factors. The extract from dput(head(df)) is:
structure(list(`2012` = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("N",
"X"), class = "factor"), `2013` = structure(c(6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L
), .Label = c("C", "D", "N", "S", "U", "X"), class = "factor"),
`2014` = structure(c(8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L), .Label = c("C",
"D", "L", "N", "R", "S", "U", "X"), class = "factor"), ...
I would like to have the data in numeric values, but I don't know how I can transform them accordingly.
The goal is to feed the df into a heatmap so that I can visually explore the differences. To my knowledge, this is only possible with a numeric matrix, because I get the error: heatmap.2(input, trace = "none", : `x' must be a numeric matrix
Does someone have any idea?
Many Thanks for your support!

It's doable. Next time, it would help to include the complete df. heatmap.2 does not work because you gave it a character matrix. Showing a legend that maps colors to letters is a bit more complicated with heatmap.2, so I suggest something like the following using ggplot:
library(ggplot2)
library(dplyr)
library(tidyr)  # provides pivot_longer()
library(viridis)

# Simulate data: 5 ids and 7 yearly columns of randomly drawn letters
df <- data.frame(
  id = 1:5,
  replicate(7, sample(LETTERS[1:10], 5))
)
colnames(df)[-1] <- 2012:2018

# Convert to long format for plotting (columns: id, name, value) and turn
# `value` into a factor with alphabetically sorted levels. The levels are
# derived from the values themselves, so this works whether the letter
# columns are factors or plain character vectors.
df <- df %>%
  pivot_longer(-id) %>%
  mutate(
    value = as.character(value),
    value = factor(value, levels = sort(unique(value)))
  )

# Define the color scale: one viridis color per letter that is present,
# named by letter (sorted in alphabetical order)
present_letters <- levels(df$value)
COLS <- viridis_pal()(length(present_letters))
names(COLS) <- present_letters

# Plot: one tile per id/year combination, filled by letter
ggplot(data = df, aes(x = name, y = id, fill = value)) +
  geom_tile() +
  scale_fill_manual(values = COLS)

Related

Why isn't my barplot rearranging properly when faceting with ggplot?

So I have made this barplot with this code, bars organised in descending order, great!
# Total arrivals per native range (rows with missing values dropped first)
range_totals <- insect_tally_native_ranges %>%
  na.omit() %>%
  group_by(native_ranges) %>%
  dplyr::summarise(freq = sum(n))

# Flipped bar chart with the ranges ordered by their total
ggplot(range_totals, aes(x = reorder(native_ranges, freq), y = freq)) +
  geom_col(color = "#CD4F39", fill = "#CD4F39", alpha = 0.8) +
  coord_flip() +
  labs(
    x = "Native ranges",
    y = "Number of invasive insect arrivals",
    title = "Species by native ranges"
  ) +
  theme_minimal()
And now I wanted to do the same but faceting by a variable called Period, here's the code:
# Bar chart of freq per native range (y axis), one panel per Period.
# NOTE(review): reorder() is evaluated once over all rows, not per facet; if
# its default summary (the mean of freq per level) applies here, the y-axis
# order reflects that mean across periods rather than the totals — confirm.
# NOTE(review): x is mapped to freq and y to native_ranges, yet labs() names
# x "Native ranges" and y "Number of invasive insect arrivals" — the two
# labels look swapped (carried over from the coord_flip() version).
# geom_barh() is not a ggplot2 geom; presumably it comes from ggstance.
ggplot(native_freq_period,
aes(y=reorder(native_ranges,freq),x=freq))+
geom_barh(stat= "identity",
color="#CD4F39",
fill="#CD4F39",
alpha=0.8)+
labs(x="Native ranges",
y="Number of invasive insect arrivals",
title="Species by native ranges")+
theme_minimal()+
facet_wrap(~Period)
But the plot came out like this:
Which is pretty annoying because it is the same code as above and the levels for the variable native_ranges should be organised again. But instead it gives me this lumpy order that isn't even the alphabetic order. So the reorder part is reordering but not by freq! Don't understand.
Here is the data:
structure(list(native_ranges = structure(c(6L, 10L, 11L, 7L,
3L, 5L, 1L, 1L, 8L, 6L, 3L, 5L, 2L, 4L, 5L, 7L, 7L, 7L, 8L, 9L,
11L), .Label = c("Afrotropic", "Afrotropic/Neotropic", "Australasia",
"Australasia/Neotropic", "Indomalaya", "Nearctic", "Neotropic",
"Neotropic/Nearctic", "Neotropic/Nearctic/Australasia", "Palearctic",
"Palearctic/Indomalaya"), class = "factor"), Period = structure(c(4L,
4L, 4L, 4L, 4L, 4L, 3L, 4L, 4L, 3L, 3L, 3L, 4L, 4L, 2L, 1L, 2L,
3L, 2L, 4L, 3L), .Label = c("1896-1925", "1926-1955", "1956-1985",
"1986-2018"), class = "factor"), freq = c(21L, 13L, 12L, 11L,
10L, 10L, 4L, 4L, 4L, 3L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L)), row.names = c(NA, -21L), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), vars = "native_ranges", drop = TRUE, indices = list(
6:7, 12L, c(4L, 10L), 13L, c(5L, 11L, 14L), c(0L, 9L), c(3L,
15L, 16L, 17L), c(8L, 18L), 19L, 1L, c(2L, 20L)), group_sizes = c(2L,
1L, 2L, 1L, 3L, 2L, 4L, 2L, 1L, 1L, 2L), biggest_group_size = 4L, labels = structure(list(
native_ranges = structure(1:11, .Label = c("Afrotropic",
"Afrotropic/Neotropic", "Australasia", "Australasia/Neotropic",
"Indomalaya", "Nearctic", "Neotropic", "Neotropic/Nearctic",
"Neotropic/Nearctic/Australasia", "Palearctic", "Palearctic/Indomalaya"
), class = "factor")), row.names = c(NA, -11L), class = "data.frame", vars = "native_ranges", drop = TRUE))
You have to arrange the order of the variable first before plotting. Since you didn't provide any reproducible data I am using the following data
# Small example data set: three drugs and their measured effect
drugs <- data.frame(
  drug = c("a", "b", "c"),
  effect = c(4.2, 9.7, 6.1)
)
# Bar chart with the drugs in their default order
ggplot(data = drugs, mapping = aes(x = drug, y = effect)) +
  geom_col()
Now to change the order of the variable use factor
# Spell out the factor levels by hand; this is the order I want on the axis
wanted_order <- c("b", "a", "c")
drugs$drug <- factor(drugs$drug, levels = wanted_order)
ggplot(data = drugs, mapping = aes(x = drug, y = effect)) +
  geom_col()
Here I provided the levels in factor manually. You can either provide them manually or sort the order of the variable first separately and provide. See below,
# Derive the level order from the data: drugs sorted by increasing effect
by_effect <- order(drugs$effect)
drugs$drug <- factor(drugs$drug, levels = drugs$drug[by_effect])
ggplot(data = drugs, mapping = aes(x = drug, y = effect)) +
  geom_col()
This should work with facet_wrap as well.
OK, finally figured it out with help from the other answer. You need to create another column that summarizes the total frequency so you can then reorder by that column. There may be a more efficient way to do it, but I create a new summary data.frame and then join it back to the original and then reorder based on the new column.
# Total frequency per native range across all periods. The ranges are ordered
# by this value, so every facet shares the same y-axis order.
summary_data <- data %>%
  ungroup() %>%
  group_by(native_ranges) %>%
  summarize(total = sum(freq))

# Attach the totals to every row. The join key is spelled out so the join
# does not depend on which column names the two tables happen to share.
data <- data %>%
  left_join(summary_data, by = "native_ranges")

# y is the native range and x the count (no coord_flip here), so the x label
# describes the counts and the y label the ranges.
ggplot(data, aes(y = reorder(native_ranges, total), x = freq)) +
  geom_barh(stat = "identity",
            color = "#CD4F39",
            fill = "#CD4F39",
            alpha = 0.8) +
  labs(x = "Number of invasive insect arrivals",
       y = "Native ranges",
       title = "Species by native ranges") +
  theme_minimal() +
  facet_wrap(~Period)

Suppressing data from a graph in R

I have a dataset, d, that contains personally identifiable data, I have the dataset putting an X for all values that are suppressed:
column1 column2 column3
* FSM X
* Male 2.5
* Female X
A FSM 6
A Male 10.3
A Female 11.7
B FSM 14.8
B Male 21.5
B Female 25.3
I want to plot this with an X above the bars in a bar plot, where data has been suppressed, such as:
My code is:
# Dodged bar chart of column3 per column1, one bar per column2 level, with a
# text label above each bar and a y axis titled "Percentage".
# NOTE: in the data dump below, column3 is a factor that contains "X";
# presumably that is what leads to the error quoted underneath.
p <- ggplot(d, aes(x=column1, y=column3, fill=column2)) +
  geom_bar(position=position_dodge(), stat="identity", colour="black") +
  geom_text(aes(label=column2), position=position_dodge(width=0.9), vjust=-.5) +
  scale_y_continuous("Percentage", breaks=seq(0, max(d$column3), 2))
But of course, it can't plot 'X' on the graph and says:
Error: Discrete value supplied to continuous scale
How can I get the bar plotting to ignore the 'X' and still add the label if it's present?
Data dump:
structure(list(column1 = structure(c(1L, 1L, 1L, 2L, 2L, 2L,
3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L), .Label = c("*",
"A", "B", "C", "D", "E", "U"), class = "factor"), column2 = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L), .Label = c("FSM", "Male", "Female"), class = "factor"),
column3 = structure(c(21L, 1L, 2L, 18L, 3L, 4L, 7L, 12L,
14L, 16L, 15L, 13L, 10L, 9L, 8L, 11L, 6L, 5L, 20L, 19L, 17L
), .Label = c("1.93889541715629", "1.97444831591173", "10.1057579318449",
"11.7305458768873", "12.7758420441347", "14.4535840188014",
"14.8471615720524", "18.5830429732869", "19.9764982373678",
"20.0873362445415", "20.9606986899563", "21.5628672150411",
"24.1579558652729", "25.3193960511034", "25.7931844888367",
"29.2576419213974", "5.45876887340302", "6.11353711790393",
"6.16921269095182", "6.98689956331878", "X"), class = "factor")), .Names = c("column1",
"column2", "column3"), row.names = c(NA, -21L), class = "data.frame")
I'm happy to print 0 where there are 0 instances, but in the case of data suppression I want to make it clear that the data has been suppressed by printing an 'X', even though the bar will also show 0 instances.
First convert the height to numeric which gives NA for censored values. Then create a label column based on that. Then you need a column of zeroes for the y coordinate of the labels.
> d$column3=as.numeric(as.character(d$column3))
Warning message:
NAs introduced by coercion
> d$column4 = ifelse(is.na(d$column3),"X","")
> d$y=0
Then:
> # Base plot: bars of column3 per column1, filled by column2
> p <- ggplot(d, aes(x=column1, y=column3, fill=column2))
> # Dodge the labels by the same width as the bars (0.9, the default bar
> # width) so that each "X" is centred over its own bar position
> p + geom_bar(position=position_dodge(width=0.9), stat="identity",
colour="black") +
geom_text(aes(label=column4,x=column1,y=y),
position=position_dodge(width=0.9), vjust=-0.5)
Giving:
It's a variant on labelling a geom_bar with the value of the bar. Almost a dupe.

assign multiple color to each vertex in igraph

I have a dataframe d:
d<-structure(list(V1 = c(1L, 3L, 3L, 2L, 1L, 1L, 7L, 9L, 10L, 9L, 7L), V2 = c(2L, 4L, 5L, 5L, 4L, 6L, 8L, 3L, 1L, 8L, 5L)),
.Names = c("V1", "V2"), class ="data.frame", row.names = c(NA, -11L))
g<-graph.data.frame(d,directed = F)
I would like to assign one or more colors to each vertex, depending on its affiliation variable given in a dataframe m:
m<-structure(list(vertex = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 9L, 9L, 10L, 1L, 1L, 6L, 6L), affilation = c(1L, 1L, 1L, 2L, 2L, 1L, 3L, 3L, 3L, 2L, 2L, 3L, 3L, 2L, 2L, 3L)),
.Names = c("vertex", "affilation"), class = "data.frame", row.names = c(NA, -16L))
I would like to ask whether there is a simple method in the igraph package to assign one or multiple colors to each vertex according to its affiliation.
(EDIT) Because the edges were specified as integers the vertices were not in order in the graph. I changed the initial graph.data.frame call to specify the order, then everything should work.
# Pass the vertex ids explicitly so the vertices are created in the order 1..10
g<-graph.data.frame(d,directed = FALSE, vertices = 1:10)
You can assign color as a vertex attribute by assigning into V(g)$color. Here's some code for your example (only for a single affiliation)
# Sort the affiliation table by vertex id
m <- m[order(m[["vertex"]]), ]
# Build a palette with one color per distinct affiliation (any palette would do)
n_affil <- length(unique(m[["affilation"]]))
pal <- rainbow(n = n_affil)
# Keep only the first affiliation listed for each vertex
is_first <- !duplicated(m[["vertex"]])
first_affil <- m[["affilation"]][is_first]
# Store the matching palette entry as each vertex's color attribute
V(g)$color <- pal[first_affil]
plot(g)
Now for multiple affiliations it's not too clear what you want. You could look at vertex.shape.pie that can draw shapes with more than one color on. Something like this works for me (but there's quite a bit of data wrangling to get it going)
# Use a cast to split out affiliation in each group as a column:
# one row per vertex, one column per affiliation, each cell holding the
# number of rows of m with that vertex/affiliation pair. The counted column
# and the aggregation are given explicitly instead of being guessed.
library(reshape2)
am <- acast(m, formula = vertex ~ affilation,
            fun.aggregate = length, value.var = "affilation")
# Turn this into a list with one count vector per row of am
values <- lapply(seq_len(nrow(am)), function(i) am[i, ])
# Draw each vertex as a pie using those counts, colored with the palette pal
plot(g, vertex.shape = "pie", vertex.pie = values, vertex.pie.color = list(pal))

How to create 5x1 matrix geom_bar facet and allowing negative value in bar to go down y-axis in ggplot?

The following figure:
Was generated with the following code:
library(ggplot2)
library(reshape2)
dat <- structure(list(Type = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L), .Label = c("High_expression",
"KD.ip", "LG.ip", "LN.id", "LV.id", "LV.ip", "SP.id", "SP.ip"
), class = "factor"), ImmGen = structure(c(1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), .Label = c("Bcells",
"DendriticCells", "Macrophages", "Monocytes", "NKCells", "Neutrophils",
"StemCells", "StromalCells", "abTcells", "gdTCells"), class = "factor"),
Exp_06hr = c(7174.40482999999, 23058.74882, 39819.39133,
15846.46146, 8075.78226, 105239.11609, 7606.34563, 19513.57747,
7116.51211, 6978.64995, 498.36828, 732.01788, 621.51576,
546.63461, 529.1711, 545.17219, 477.54658, 1170.50303, 550.99528,
607.56707, 775.0691, 1269.50773, 2138.69883, 1561.74652,
601.9372, 5515.59896, 744.48716, 997.32859, 639.13126, 657.64581,
4165.29899, 5465.1883, 7773.25723, 5544.86758, 3461.13442,
8780.64899, 4380.00437, 8721.84871, 3674.62723, 3911.00108,
2932.76554, 5903.48407, 6179.81046, 3683.64539, 2744.59622,
6760.37307, 4097.14665, 6845.31988, 2872.77771, 2912.84262
), Exp_24hr = c(1596.9091, 4242.52354, 9984.68861, 3519.18627,
1602.92511, 12203.57109, 1656.19357, 3389.93866, 1617.35484,
1579.00309, 715.47289, 643.98371, 689.40412, 580.26036, 608.22853,
695.10737, 830.77947, 670.34899, 640.67908, 637.47464, 356.75713,
393.13449, 549.60095, 466.76064, 336.95453, 617.20976, 339.2476,
469.57407, 292.86365, 305.45178, 2604.07605, 4210.64843,
5797.13123, 3650.88447, 2275.03269, 6475.27485, 2604.70614,
4796.3314, 2411.09694, 2458.23237, 1498.21516, 1996.6875,
2927.82836, 1911.00463, 1523.57171, 2199.62297, 1541.82034,
2815.82184, 1608.46099, 1588.80561), ExpDiff_06_24hr = c(5577.49572999999,
18816.22528, 29834.70272, 12327.27519, 6472.85715, 93035.545,
5950.15206, 16123.63881, 5499.15727, 5399.64686, -217.10461,
88.03417, -67.88836, -33.62575, -79.05743, -149.93518, -353.23289,
500.15404, -89.6838, -29.9075700000001, 418.31197, 876.37324,
1589.09788, 1094.98588, 264.98267, 4898.3892, 405.23956,
527.75452, 346.26761, 352.19403, 1561.22294, 1254.53987,
1976.126, 1893.98311, 1186.10173, 2305.37414, 1775.29823,
3925.51731, 1263.53029, 1452.76871, 1434.55038, 3906.79657,
3251.9821, 1772.64076, 1221.02451, 4560.7501, 2555.32631,
4029.49804, 1264.31672, 1324.03701)), .Names = c("Type",
"ImmGen", "Exp_06hr", "Exp_24hr", "ExpDiff_06_24hr"), row.names = c(NA,
-50L), class = "data.frame")
dat.m <- melt(dat)
setwd("~/Desktop/")
pdf("myfig.pdf",width=30,height=20)
p <- ggplot(dat.m,aes(ImmGen,value)) +
geom_bar(aes(fill = variable),position = "dodge",stat="identity")+
facet_wrap(~Type)
p
dev.off();
How can I modify it so that, instead of wrapping the panels into a (2x3) matrix like the above, we create a (5x1) matrix, with each row having its own y-axis scale?
Secondly, notice that the blue bar (ExpDiff_06_24hr) can contain negative values. How can I show them so that the bar goes below 0 on the y-axis in the plot?
I think the subplots shouldn't be plotted in one row but in one column, for clarity. With some help from this answer (thanks to hrbrmstr for linking to it), and because I think this question deserves an answer, here is a solution:
# Rank every row by its expression difference; the rank drives the bar order
dat[["rank"]] <- rank(dat[["ExpDiff_06_24hr"]])
# Long format, keeping Type, ImmGen and rank as identifier columns
dat.m <- melt(dat, id.vars = c("Type", "ImmGen", "rank"))
# Combined Type+ImmGen key for the x axis, with its levels ordered by rank
dat.t <- dat.m
dat.t$TyIm <- factor(paste0(dat.t$Type, dat.t$ImmGen))
dat.t$TyIm <- reorder(dat.t$TyIm, rank(dat.t$rank))
# One panel per Type in a single column with free scales; the x axis is
# labelled with the ImmGen name instead of the combined key
p <- ggplot(dat.t, aes(x = TyIm, y = value)) +
  geom_bar(aes(fill = variable), position = "dodge", stat = "identity") +
  facet_wrap(~Type, ncol = 1, scales = "free") +
  scale_x_discrete(name = "ImmGen", breaks = dat.t$TyIm, labels = dat.t$ImmGen)
p
The result:

How to modify slots lme4 >1.0

I have been using the code below to successfully modify the 'Zt', 'L', and 'A' slots of models fit using lme4 versions <1.0. I just updated to lme4 1.0-4 today and found that the model objects are different. Can anyone provide insight/guidance as to how to modify these slots in the new lmer model objects?
dat<-structure(list(pop1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 10L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L,
7L, 8L, 8L, 9L), pop2 = c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
5L, 6L, 7L, 8L, 9L, 10L, 6L, 7L, 8L, 9L, 10L, 7L, 8L, 9L, 10L,
8L, 9L, 10L, 9L, 10L, 10L), X = c(0.49136, 0.75587, 0.93952,
0.61278, 0.79934, 1.07918, 1.13354, 1.15836, 1.2014, 0.43136,
0.77815, 0.716, 0.93952, 1.13672, 1.16137, 1.18184, 1.21748,
0.65321, 0.86332, 1.04922, 1.19866, 1.20412, 1.22272, 1.24797,
0.89763, 1.08991, 1.19033, 1.15836, 1.17319, 1.18752, 0.64345,
0.93952, 0.98227, 1.01703, 1.07188, 0.78533, 0.94939, 0.99564,
1.06819, 0.64345, 0.716, 0.85126, -0.04576, 0.4624, 0.30103),
Y = c(0.491694, 0.394703, 0.113303, 0.156597, 0.450924, 0.487845,
0.21821, 0.129027, -0.131522, 0.35156, -0.116826, 0.18941,
0.306608, 0.258401, 0.008552, -0.024369, -0.305258, -0.013628,
0.215715, 0.13783, 0.467272, 0.088882, 0.084295, -0.172337,
-0.206725, -0.084339, -0.191651, -0.001586, -0.079501, -0.195094,
0.232045, 0.17102, 0.003742, -0.023688, -0.26085, 0.205326,
0.172809, 0.133219, -0.159054, 0.082231, 0.011025, -0.238611,
0.732679, 0.478058, 0.325698)), .Names = c("pop1", "pop2",
"X", "Y"), class = "data.frame", row.names = c(NA, -45L))
library(lme4) # lme4 versions >1.0 have different model output
# Specify the model formula
lmer_mod <- as.formula("Y ~ X + (1|pop1)")
# Create the Zl and ZZ matrices: one sparse indicator matrix per population
# column, summed into a single matrix
Zl <- lapply(c("pop1","pop2"), function(nm) Matrix:::fac2sparse(dat[[nm]], "d", drop=FALSE))
ZZ <- Reduce("+", Zl[-1], Zl[[1]])
# Fit lmer model to the data
mod <- lmer(lmer_mod, data = dat, REML = TRUE)
# Replace the following slots in the fitted model
# These slots don't exist in this form in the new lmerMod objects
# (slots are accessed with `@`; a `#` would turn the rest of the line into a comment)
mod@Zt <- ZZ
mod@A <- ZZ
mod@L <- Cholesky(tcrossprod(ZZ), LDL=FALSE, Imult=1)
# Refit the model to the same response data (the column is selected by name)
Final.mod <- refit(mod, dat[,"Y"])
Any help or insight as to how to modify these slots will be greatly appreciated. In the meantime, I guess I will stick to using an older version of lme4 for these models.
Does this do what you want? (This follows ?modular pretty closely ...)
Create the Zl and ZZ matrices:
# One sparse indicator matrix per population column, keeping unused factor
# levels. fac2sparse is exported from Matrix, so `::` is sufficient, and the
# arguments are named in full rather than relying on partial matching.
Zl <- lapply(
  c("pop1", "pop2"),
  function(nm) Matrix::fac2sparse(dat[[nm]], to = "d", drop.unused.levels = FALSE)
)
# Add the indicator matrices together into a single matrix
ZZ <- Reduce("+", Zl[-1], Zl[[1]])
Construct the random-effects data structures:
lf <- lFormula(Y ~ X + (1|pop1), data=dat)
Modify them:
lf$reTrms$Zt <- ZZ
Proceed through the remaining model-construction and fitting steps:
## Build the deviance function from the modified lFormula() result
dfun <- do.call(mkLmerDevfun, lf)
## Optimise the deviance function, i.e. fit the model
opt <- optimizeLmer(dfun)
## Package the optimiser result and model pieces into a 'merMod' object
fit <- mkMerMod(
  rho = environment(dfun),
  opt = opt,
  reTrms = lf$reTrms,
  fr = lf$fr
)

Resources