Cumulative Frequency Graph in R - r

I've been trying to look around to see if other questions helped - they didn't.
I've imported my data as follows:
Data <- read.csv("Module 1.csv")
Data Output:
structure(list(ID = 1:50, Data_Points = c(41L, 42L, 43L, 44L,
45L, 45L, 45L, 46L, 47L, 48L, 48L, 49L, 50L, 50L, 52L, 53L, 54L,
55L, 55L, 57L, 57L, 57L, 58L, 58L, 58L, 59L, 60L, 62L, 62L, 63L,
65L, 67L, 68L, 69L, 70L, 71L, 71L, 72L, 73L, 75L, 75L, 77L, 82L,
83L, 83L, 85L, 85L, 86L, 87L, 89L), LCL = c(40L, 48L, 56L, 64L,
72L, 80L, 88L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
UCL = c(47L, 55L, 63L, 71L, 79L, 87L, 95L, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), LCB = c(39.5, 47.5,
55.5, 63.5, 71.5, 79.5, 87.5, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), UCB = c(47.5, 55.5, 63.5, 71.5,
79.5, 87.5, 95.5, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), MP = c(43.5, 51.5, 59.5, 67.5, 75.5, 83.5, 91.5,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Frequency = c(9L,
10L, 11L, 7L, 5L, 7L, 1L, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), Cumulative_Frequency = c(9L, 19L, 30L,
37L, 42L, 49L, 50L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA)), class = "data.frame", row.names = c(NA, -50L
))
I do not know why R placed the letter "L" after many of the numbers.
Please ignore any "L" that you see.
Code for Cumulative Frequency Version 1:
So, I have done this in two ways, both of which are wrong.
The first code I tried is as follows:
ggplot(Data, aes(x = Data_Points, y = cumsum(Data_Points))) +
geom_line() +
geom_point() +
labs(x = "Data Points",
y = "Frequency",
title = "Cumulative Frequency Polygon of the Data Provided") +
scale_x_continuous(breaks = seq(39.5, 95.5, by = 8)) +
scale_y_continuous(breaks = c(0:12)) +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 16)) +
theme(axis.title.y.right = element_text(margin = margin(0, 0, 0, 10)))
It looks something like this:
Weird, not a smooth line
The line doesn't show each individual frequency in between each class whatsoever.
Code for Cumulative Frequency Version 2:
I've also tried using geom_line() and geom_point() to see if that helps (newsflash! It does not).
I wrote something like this for the code (I changed it multiple times at this point, with no luck).
ggplot(Data, aes(x = Data_Points, y = Cumulative_Frequency)) +
geom_line() +
geom_point() +
labs(x = "Data Points",
y = "Frequency",
title = "Cumulative Frequency Polygon of the Data Provided") +
scale_x_continuous(breaks = seq(39.5, 95.5, by = 8)) +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 16)) +
theme(axis.title.y.right = element_text(margin = margin(0, 0, 0, 10)))
Here is what this looks like:
What the heck?
Any help is much appreciated.

The usual way to show an empirical cumulative distribution function from a particular data set would be to use stat_ecdf:
library(ggplot2)
# Plot the empirical cumulative distribution function (ECDF) of the raw
# observations. stat_ecdf() computes, for each observed value, the proportion
# of data points less than or equal to it, so the y axis runs from 0 to 1.
ggplot(Data, aes(x = Data_Points)) +
  stat_ecdf() +
  labs(
    x = "Data Points",
    # The ECDF is a cumulative proportion, not a density.
    y = "Cumulative proportion",
    title = ""
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
    axis.title.y.right = element_text(margin = margin(0, 0, 0, 10))
  )
The steps are to be expected from the given density of the data points and are quite normal. However, if you want a smoother version you could create a cumulative frequency polygon like this:
# Cumulative relative-frequency polygon: count how often each distinct value
# occurs, accumulate the counts, and divide by the total so that the curve
# ends at 1.
ggplot(
  as.data.frame(table(Data$Data_Points)),
  aes(
    # as.data.frame(table(...)) stores the values as factor levels; go through
    # character so we recover the original numbers, not the level indices.
    x = as.numeric(as.character(Var1)),
    y = cumsum(Freq) / sum(Freq)
  )
) +
  geom_line() +
  labs(
    x = "Data Points",
    # The plotted quantity is the running total divided by the grand total.
    y = "Cumulative relative frequency",
    title = "Cumulative Frequency Polygon of the Data Provided"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
    axis.title.y.right = element_text(margin = margin(0, 0, 0, 10))
  )
Created on 2022-06-08 by the reprex package (v2.0.1)

Related

Geom_rect() removed after log2 transformation

(I am aware of a similar issue discussed here but not able to get these solutions to work)
I used this code to successfully create this plot with a linear scale on the y axis. The important bit is the geom_rect() that creates the blue lines that later disappear when I transform the y axis to a log2 scale.
plot <- ggplot(data, aes(x=ï.., y=Casirivimab)) + geom_point(size = 1)
#plot <- plot + scale_y_continuous(trans='log2', breaks = trans_breaks("log2", function(x) 2^x),labels = trans_format("log2", math_format(2^.x)))
plot <- plot + geom_rect(aes(xmin=ï..-.1, xmax=ï..+.1, ymin=-Inf, ymax=Inf, fill = Cas.Epitope), alpha=0.5, stat="identity")
plot <- plot + scale_fill_manual(values = c("x"="blue"), na.value= "grey92")
plot <- plot + coord_cartesian(xlim = c(340,510))
plot <- plot + theme(panel.background = element_rect(colour = "grey92", fill = "grey92"))
plot <- plot + labs(x = "amino acid position", y = "fold change", title = "Casirivimab", color = "Epitope Position")
plot_Cas <- plot
plot_Cas
This was the resultant plot
Once I add in the log2 transformation, the blue lines created by geom_rect disappear. How do I prevent these blue lines from disappearing when I transform the axis to a log2 scale?
plot <- ggplot(data, aes(x=ï.., y=Casirivimab)) + geom_point(size = 1)
plot <- plot + scale_y_continuous(trans='log2', breaks = trans_breaks("log2", function(x) 2^x),labels = trans_format("log2", math_format(2^.x)))
plot <- plot + geom_rect(aes(xmin=ï..-.1, xmax=ï..+.1, ymin=-Inf, ymax=Inf, fill = Cas.Epitope), alpha=0.5, stat="identity")
plot <- plot + scale_fill_manual(values = c("x"="blue"), na.value= "grey92")
plot <- plot + coord_cartesian(xlim = c(340,510))
plot <- plot + theme(panel.background = element_rect(colour = "grey92", fill = "grey92"))
plot <- plot + labs(x = "amino acid position", y = "fold change", title = "Casirivimab", color = "Epitope Position")
plot_Cas <- plot
plot_Cas
This is the resulting plot:
Many thanks for your help in advance.
P.s. here is the data set from which the data plotted on the graphs is taken. The plotted data is for "Casirivimab" only.
structure(list(ï.. = c(18L, 69L, 80L, 141L, 215L, 222L, 241L,
246L, 247L, 321L, 333L, 334L, 335L, 337L, 339L, 340L, 341L, 343L,
344L, 345L, 346L, 348L, 351L, 354L, 357L, 359L, 360L, 361L, 367L,
378L, 384L, 403L, 405L, 406L, 408L, 409L, 415L, 416L, 417L, 417L,
417L, 420L, 421L, 435L, 439L, 440L, 441L, 444L, 444L, 445L, 446L,
447L, 448L, 449L, 450L, 452L, 452L, 453L, 455L, 456L, 456L, 456L,
457L, 458L, 460L, 470L, 472L, 473L, 474L, 475L, 475L, 476L, 477L,
478L, 478L, 479L, 481L, 482L, 483L, 484L, 484L, 485L, 486L, 486L,
486L, 486L, 486L, 487L, 488L, 489L, 490L, 490L, 492L, 493L, 494L,
494L, 495L, 496L, 498L, 500L, 501L, 502L, 503L, 504L, 505L, 508L,
509L, 519L, 537L, 570L, 583L, 655L, 681L, 681L, 692L, 701L, 716L,
859L, 982L, 1118L, 1147L, 1163L, 1229L), Mutation = c("L18F",
"Δ69-70", "D80A", "Δ141-146", "D215G", "A222V", "Δ242-247",
"R246I", "S247R", "Q321L", NA, NA, NA, NA, NA, NA, "V341I", NA,
NA, NA, NA, "A348T", NA, "N354D", NA, "S359N", NA, NA, "V367F",
"K378R", "P384L", NA, NA, "E406W", "R408I", "Q409E", NA, NA,
"K417E", "K417N", "K417T", NA, NA, "A435S", "N439K", "N440K",
NA, "K444Q", "K444T", "V445A", "G446V", NA, NA, "Y449N", "N450D",
"L452Q", "L452R", "Y453F", "L455F", "F456A", "F456K", "F456R",
NA, "K458R", "N460T", NA, "I472V", NA, NA, "A475R", "A475V",
"G476S", "D.2 (S477N)", "T478I", "T478K", "P479S", NA, NA, "V483A",
"E484K", "E484Q", "G485D", "F486K", "F486L", "F486R", "F486S",
"F486V", "N487R", NA, NA, "F490L", "F490S", NA, "Q493K", "S494P",
"S494R", NA, NA, NA, NA, "N501Y", NA, NA, NA, NA, "Y508H", NA,
"H519P", "K537R", "A570D", "E583D", "H655Y", "P681H", "P681R",
"I692V", "A701V", "T716I", "T859N", "S982A", "D1118H", "S1147L",
"D1163Y", "M1229I"), Cas.Epitope = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, "x", NA, NA, NA, NA, NA,
NA, NA, "x", "x", NA, "x", NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, "x", "x", "x", NA, NA, NA, NA, NA, NA, NA,
NA, NA, "x", "x", "x", NA, NA, NA, NA, NA, NA, NA, "x", "x",
"x", "x", "x", "x", "x", "x", "x", "x", "x", NA, NA, NA, "x",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Casirivimab = c(0.766666667,
1.2, 0.5, 0.9, 1.2, 1.25, 0.866666667, 0.7, NA, 1.6, NA, NA,
NA, NA, NA, NA, 0.8, NA, NA, NA, NA, 1, NA, 1.4, NA, 0.5, NA,
NA, 0.6, 0.8, 1.9, NA, NA, 84.4, 0.5, 1.3, NA, NA, 62.8, 34.8,
7.1, NA, NA, 1.1, 1.22, 1, NA, 0.95, 2, 1.75, 0.4, NA, NA, NA,
1.4, 5.8, 3.966666667, 73.91428571, 89.95, NA, NA, 3, NA, 0.8,
NA, NA, 2.2, NA, NA, 44.4, NA, 3.3, 2.45, 0.866666667, 2.9, 0.9,
NA, NA, 0.4, 18.84166667, 26.5, 3.7, 100, 48.6, 100, 100, 75.2,
100, NA, NA, 2.15, 0.866666667, NA, 42.66666667, 2.966666667,
0.8, NA, NA, NA, NA, 0.9, NA, NA, NA, NA, 1.1, NA, 0.7, 1, 0.433333333,
1, 1.2, 0.833333333, 1.2, 0.3, 0.7, 0.366666667, 1.3, 1, 0.9,
2.1, 0.8, 1.8), Bam.Epitope = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "x",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "x",
"x", NA, "x", NA, "x", "x", "x", "x", NA, NA, NA, NA, "x", NA,
NA, NA, NA, NA, NA, NA, NA, NA, "x", "x", "x", "x", "x", "x",
"x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x",
"x", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Bamlanivimab = c(NA,
0.5, NA, NA, NA, 0.5, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 0.4, NA, NA, NA, NA, 1.3, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 100, 1.55, NA, NA, NA, 4.2,
NA, NA, NA, NA, NA, NA, NA, 2.5, NA, NA, 1.1, 2.6, NA, NA, NA,
NA, NA, 100, 100, NA, NA, NA, 100, NA, NA, 0.9, NA, NA, 100,
100, NA, NA, 100, 100, NA, NA, NA, NA, 1.1, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), Ete.Epitope = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "x", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, "x", "x", "x", "x", "x", "x", "x", NA,
"x", "x", "x", "x", NA, NA, NA, NA, "x", "x", "x", "x", "x",
NA, "x", "x", "x", "x", NA, "x", "x", "x", "x", "x", "x", "x",
NA, NA, "x", "x", "x", "x", "x", "x", NA, NA, NA, NA, NA, NA,
"x", "x", NA, "x", "x", "x", "x", "x", "x", NA, "x", NA, NA,
NA, "x", "x", "x", "x", NA, NA, NA, "x", "x", NA, "x", "x", NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), Etesevimab = c(0.6, 0.65, NA, 0.6, NA, 1.2, 0.2, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 83, 49,
NA, NA, NA, 0.35, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1.4, 1.4, NA, 14.9, 32.6, 100, NA, NA, 100, NA, NA, NA, NA, 100,
16.7, NA, 0.6, 0.6, NA, NA, NA, NA, NA, 3.125, 1.3, NA, NA, 2.9,
17.9, NA, NA, 100, NA, NA, 4.5, 0.8, NA, NA, 0.55, 3.3, NA, NA,
NA, NA, 2.25, NA, NA, NA, NA, NA, NA, NA, NA, 0.2, 0.3, 0.5,
0.5, NA, NA, NA, 0.4, NA, 0.8, 0.5, NA, 0.7, NA), Imde.Epitope = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "x", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "x", "x", "x",
"x", "x", "x", "x", "x", "x", "x", "x", "x", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, "x", "x", NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), Imdevimab = c(0.7, 0.65, 0.6, 0.95, 0.7, 0.675, 0.433333333,
0.7, NA, 1.2, NA, NA, NA, NA, NA, NA, 0.8, NA, NA, NA, NA, 0.7,
NA, 0.7, NA, 0.6, NA, NA, 0.4, 0.6, 1.3, NA, NA, 100, 0.4, 1,
NA, NA, 1.166666667, 0.566666667, 1.1, NA, NA, 1, 24.66, 95.6,
NA, 68, 100, 68.26666667, 58.35, NA, NA, NA, 18.6, 4.4, 3.15,
1.5625, 1.2, NA, NA, 0.3, NA, 0.4, NA, NA, 1, NA, NA, 0.3, NA,
0.5, 1.375, 0.933333333, 1.4, 1.6, NA, NA, 0.6, 2.169230769,
1.1, 2.2, 0.4, 0.8, 0.3, 0.4, 2.05, 0.1, NA, NA, 2.933333333,
2.35, NA, 2.3, 1.233333333, 3.2, NA, NA, NA, NA, 0.633333333,
NA, NA, NA, NA, 0.7, NA, 0.5, 1.2, 0.466666667, 1, 1.3, 0.766666667,
0.9, 0.6, 0.8, 0.433333333, 1.2, 1.233333333, 0.733333333, 0.9,
1, 0.3), Sotro.Epitope = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", NA,
NA, "x", "x", "x", "x", "x", NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "x", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, "x", NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), Sotrovimab = c("1.1", "0.8",
NA, "0.8", NA, "0.8", "0.5", NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "0.7", "1", NA, NA, NA, "1.05",
NA, NA, NA, NA, NA, NA, NA, NA, "#DIV/0!", NA, NA, NA, "1", NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "#DIV/0!", "0.5",
"0.7", NA, NA, NA, NA, NA, "0.45", NA, NA, NA, "1.5", NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "0.8", NA, NA, NA, NA, NA, "2.35",
NA, NA, NA, NA, NA, NA, NA, NA, "1.8", "1.1", "0.8", "0.7", NA,
NA, NA, "1", NA, "0.7", "0.5", NA, "#DIV/0!", NA), Regdan.Epitope = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "x",
NA, NA, NA, NA, NA, NA, "x", "x", "x", NA, NA, NA, NA, NA, NA,
NA, NA, NA, "x", NA, NA, "x", "x", "x", "x", "x", "x", "x", "x",
"x", NA, NA, NA, "x", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, "x", NA, NA, NA, "x", "x", "x", "x", "x", NA, NA, "x",
"x", "x", "x", "x", "x", "x", "x", "x", "x", NA, NA, NA, NA,
NA, "x", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), Regdanivimab = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 0.7, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 35, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, 8.7, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 5.5,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA,
-123L))

How to plot a cumulative frequency line graph using ggplot2?

Forgive me if this question is self explanatory, but I am still trying to get to grips with some more of R's features.
I am currently trying to use R to replot a cumulative frequency with lines I plotted in excel.
I think a lot of my problems are coming from having a lot of cells with no data, as I keep getting the warning:
Warning messages:
1: Removed 81 row(s) containing missing values (geom_path).
2: Removed 81 row(s) containing missing values (geom_path).
3: Removed 81 row(s) containing missing values (geom_path).
This is because each column represents a recording frequency which only occurred for 21 days, with a 20 day rest period between each recording period.
I have tried using geom_step() and geom_point() but I end up with these:
When I use the geom_line() function the axes are created but nothing is plotted.
The dates on the x axis also look horrendous. I tried using the code + theme(axis.text.x = element_text(angle = 90)) to rotate the labels but it still looks terrible; I am not sure if it's just too many dates.
Here is the code I have been trying to get to work for the various geom functions:
ggplot() +
geom_point(aes(x = Date, y = d2s1, group = 1), data = cf) +
geom_point(aes(x = Date, y = d20s1, group = 1), data = cf) +
geom_point(aes(x = Date, y = d10s1, group = 1), data = cf) +
theme(axis.text.x = element_text(angle = 90))
ggplot() +
geom_step(aes(x = Date, y = d2s1, group = 1), data = cf) +
geom_step(aes(x = Date, y = d20s1, group = 1), data = cf) +
geom_step(aes(x = Date, y = d10s1, group = 1), data = cf) +
theme(axis.text.x = element_text(angle = 90))
ggplot() +
geom_line(aes(x = Date, y = d2s1, group = 1), data = cf) +
geom_line(aes(x = Date, y = d20s1, group = 1), data = cf) +
geom_line(aes(x = Date, y = d10s1, group = 1), data = cf) +
theme(axis.text.x = element_text(angle = 90))
I hope this all makes sense and thank you all in advance for any help you can provide!
I read in the data using read.csv("cf.csv").
I have attached the output of dput(cf) below.
structure(list(Date = c("08/11/2019", "09/11/2019", "10/11/2019",
"11/11/2019", "12/11/2019", "13/11/2019", "14/11/2019", "15/11/2019",
"16/11/2019", "17/11/2019", "18/11/2019", "19/11/2019", "20/11/2019",
"21/11/2019", "22/11/2019", "23/11/2019", "24/11/2019", "25/11/2019",
"26/11/2019", "27/11/2019", "28/11/2019", "29/11/2019", "30/11/2019",
"01/12/2019", "02/12/2019", "03/12/2019", "04/12/2019", "05/12/2019",
"06/12/2019", "07/12/2019", "08/12/2019", "09/12/2019", "10/12/2019",
"11/12/2019", "12/12/2019", "13/12/2019", "14/12/2019", "15/12/2019",
"16/12/2019", "17/12/2019", "18/12/2019", "19/12/2019", "20/12/2019",
"21/12/2019", "22/12/2019", "23/12/2019", "24/12/2019", "25/12/2019",
"26/12/2019", "27/12/2019", "28/12/2019", "29/12/2019", "30/12/2019",
"31/12/2019", "01/01/2020", "02/01/2020", "03/01/2020", "04/01/2020",
"05/01/2020", "06/01/2020", "07/01/2020", "08/01/2020", "09/01/2020",
"10/01/2020", "11/01/2020", "12/01/2020", "13/01/2020", "14/01/2020",
"15/01/2020", "16/01/2020", "17/01/2020", "18/01/2020", "19/01/2020",
"20/01/2020", "21/01/2020", "22/01/2020", "23/01/2020", "24/01/2020",
"25/01/2020", "26/01/2020", "27/01/2020", "28/01/2020", "29/01/2020",
"30/01/2020", "31/01/2020", "01/02/2020", "02/02/2020", "03/02/2020",
"04/02/2020", "05/02/2020", "06/02/2020", "07/02/2020", "08/02/2020",
"09/02/2020", "10/02/2020", "11/02/2020", "12/02/2020", "13/02/2020",
"14/02/2020", "15/02/2020", "16/02/2020", "17/02/2020"), d2s1 = c(6L,
11L, 13L, 20L, 25L, 35L, 42L, 49L, 49L, 51L, 53L, 54L, 60L, 65L,
69L, 73L, 76L, 80L, 85L, 86L, 86L, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), d10s2 = c(0L, 6L, 8L,
10L, 11L, 14L, 14L, 15L, 18L, 19L, 21L, 21L, 22L, 22L, 24L, 24L,
26L, 27L, 31L, 32L, 32L, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), d20s1 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 3L, 9L, 13L, 19L, 24L, 26L, 32L, 38L, 44L, 46L, 48L,
50L, 56L, 62L, 64L, 64L, 73L, 83L, 92L, 99L, 105L, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), d20s2 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0L, 2L, 2L, 3L, 4L, 14L, 15L, 23L, 25L, 27L, 36L, 37L, 38L,
43L, 43L, 45L, 47L, 50L, 53L, 56L, 57L, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), d10s1 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 2L, 15L, 19L, 22L, 33L, 34L, 37L,
37L, 39L, 41L, 48L, 50L, 52L, 56L, 62L, 64L, 65L, 68L, 72L, 77L,
84L), d2s2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 4L, 4L, 4L, 4L, 4L, 7L, 9L, 9L, 12L, 12L,
14L, 17L, 17L, 23L, 24L, 24L, 24L, 26L, 26L, 30L, 33L)), class = "data.frame", row.names = c(NA,
-102L)
The function geom_step() has an argument na.rm to remove NA values, which is FALSE by default. Changing this to TRUE should give you the plots that you want. Alternatively you could change the NA data to zeroes for the same effect.
The crowded x-axis is typical of what happens when the data is stored as a factor, rather than a date. This will be related to how you read in your data, which you haven't shown.

How to have a different color spectrum for two variables in r?

I am plotting two variables with the following code:
ggplot()+
geom_point(data=subset(afc_reopening, key == "mean_spend_all"),
aes(x=day_after_reopening, y=mean_spend_cases * 100, color = winner2016)) +
stat_smooth(data=subset(afc_reopening,key == "mean_spend_all"),
formula = y~as.numeric(x), se = F,
aes(x=day_after_reopening, y=mean_spend_cases*100, color = winner2016))+
geom_point(data=subset(afc_reopening, key == "new_case_rate_07da"),
aes(x=day_after_reopening, y=mean_spend_cases,),) +
stat_smooth(data=subset(afc_reopening,key == "new_case_rate_07da"),
formula = y~as.numeric(x), se = F,
aes(x=day_after_reopening, y=mean_spend_cases, color = winner2016),)+
facet_wrap(.~deciles_income,scales = 'free')+
theme(legend.position = 'top')+
ylab('Change in consumer spending relative to 14th January and Past 7 Days New Cases Average')+
labs(title = "Change in spending and Past 7 Days New Cases Average in US Counties Grouped by Income Decile") +
scale_x_continuous(limits = c(-100,100))
The code results in the following graph:
The only problem I am having with the graph is that the points for both values are in the same color. I would like to have different color spectrum for spending and for cases (with each still being grouped by the winner of the 2016 election). How do I do this in ggplot?
Here is the output of the dput() function:
structure(list(day_month_year = structure(c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 18364, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 18392, NA, NA, NA, NA, NA, NA, NA, 18427,
18392, NA, NA, NA, 18329, NA, 18427, NA, NA, NA, NA, NA, NA,
NA, NA, 18301, NA, NA, NA, NA, NA, NA, 18294, NA, NA, NA, NA,
NA, 18441, NA, NA, NA, 18378, NA, NA, 18420, 18294, NA, NA, NA,
NA), tzone = "Europe/Prague", class = "Date"), deciles_income = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 3L, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 7L, NA, NA, NA, NA, NA, NA, NA,
3L, 5L, NA, NA, NA, 2L, NA, 9L, NA, NA, NA, NA, NA, NA, NA, NA,
5L, NA, NA, NA, NA, NA, NA, 8L, NA, NA, NA, NA, NA, 6L, NA, NA,
NA, 2L, NA, NA, 3L, 9L, NA, NA, NA, NA), winner2016 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Donald Trump",
NA, NA, NA, NA, NA, NA, NA, NA, NA, "Hillary Clinton", NA, NA,
NA, NA, NA, NA, NA, "Hillary Clinton", "Hillary Clinton", NA,
NA, NA, "Hillary Clinton", NA, "Donald Trump", NA, NA, NA, NA,
NA, NA, NA, NA, "Hillary Clinton", NA, NA, NA, NA, NA, NA, "Hillary Clinton",
NA, NA, NA, NA, NA, "Hillary Clinton", NA, NA, NA, "Hillary Clinton",
NA, NA, "Donald Trump", NA, NA, NA, NA, NA), key = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "new_case_rate_07da",
NA, NA, NA, NA, NA, NA, NA, NA, NA, "mean_spend_all", NA, NA,
NA, NA, NA, NA, NA, "new_case_rate_07da", "new_case_rate_07da",
NA, NA, NA, "mean_spend_all", NA, "mean_spend_all", NA, NA, NA,
NA, NA, NA, NA, NA, "new_case_rate_07da", NA, NA, NA, NA, NA,
NA, "new_case_rate_07da", NA, NA, NA, NA, NA, "new_case_rate_07da",
NA, NA, NA, "mean_spend_all", NA, NA, "new_case_rate_07da", "new_case_rate_07da",
NA, NA, NA, NA), mean_spend_cases = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 2.98999975990396, NA, NA,
NA, NA, NA, NA, NA, NA, NA, -0.163540740740741, NA, NA, NA, NA,
NA, NA, NA, 8.37364285714286, 4.66982142857143, NA, NA, NA, -0.0154640434782609,
NA, -0.0665955440414508, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 12.0187207792208,
NA, NA, NA, -0.187623043478261, NA, NA, 7.48311044417767, 0,
NA, NA, NA, NA)), row.names = c(NA, -75L), groups = structure(list(
day_month_year = structure(c(18294, 18294, 18301, 18329,
18364, 18378, 18392, 18392, 18420, 18427, 18427, 18441, NA
), tzone = "Europe/Prague", class = "Date"), deciles_income = c(8L,
9L, 5L, 2L, 3L, 2L, 5L, 7L, 3L, 3L, 9L, 6L, NA), .rows = structure(list(
57L, 71L, 50L, 39L, 16L, 67L, 35L, 26L, 70L, 34L, 41L,
63L, c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 13L, 14L, 15L, 17L, 18L, 19L, 20L, 21L, 22L, 23L,
24L, 25L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 36L, 37L,
38L, 40L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 51L,
52L, 53L, 54L, 55L, 56L, 58L, 59L, 60L, 61L, 62L, 64L,
65L, 66L, 68L, 69L, 72L, 73L, 74L, 75L)), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 13L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))

R Problems with glm-model due to missing values

I have problems with putting my data into a glm model. I think the problem is because I have many missing values in my data (below). I tried this so far:
baseformula = as.formula(df)
glm(baseformula, data = df, family = poisson(link = "log"), na.action = na.exclude)
I am getting an Error:
Error in glm.fit(x = numeric(0), y = integer(0), weights = NULL, start
= NULL, : object 'fit' not found
Can somebody help me with this? When a variable is NA in my formula, I just want the glm to ignore the NAs and use these variables the same as variables without NA.
structure(list(V1 = c(0L, 1L, 3L, 0L, 0L, 0L, 2L, 0L, 1L, 1L,
0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 2L, 0L, 0L, 0L, 0L,
0L, 2L, 0L, 0L, 1L, 5L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 3L, 0L, 1L,
0L), V48 = c(97.33, 96.88, 85.33, 83.75, 75.58, 86.13, 83, 95.75,
88.46, 80.25, 75, 67.17, 69.33, 64.08, 70.75, 78.46, 85.58, 83.42,
96.17, 76.5, 76.42, 65.38, 69.79, 68.38, 84.67, 89.67, 91.29,
80.54, 64.63, 72.29, 76.54, 65.33, 96.92, 91.38, 88.92, 80.63,
85.5, 76.38, 76.21, 78.29, 89.29, 87.04, 78.67), V49 = c(-0.9,
-0.1, 0, 0.9, -0.2, -6.3, -4.9, -1.2, -0.3, -1.4, 7.3, 10.5,
10.8, 17.5, 10.8, 9.2, 7.3, 8.2, 10.2, 8.5, 10.4, 25.6, 26.7,
28, 20.1, 20.2, 15.7, 15.3, 21.6, 24.8, 22.4, 27.1, 14.3, 13.8,
17.1, 19.5, 22.9, 21.9, 17.2, 18.9, 16.3, 14.2, 18.5), V58 = c(0.16208333,
-0.02576069, -0.24859501, -0.39733779, -0.35568168, -0.13908246,
-0.11529523, -0.07094469, 0.07592036, 0.13803538, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), V59 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 0.40727943, 0.44007391, 0.50582446, 0.59001139,
0.55057958, 0.53888617, 0.55019019, 0.42592698, 0.347516, 0.52019593,
0.69611622, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), V61 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 0.04555282, 0.16109391, 0.13651381, -0.02339007,
-0.24799358, -0.14477839, -0.0845835, -0.13505766, -0.06910931,
0.05876354, 0.11372484, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), V68 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.01575957,
-0.19924471, -0.39083879, -0.26620543, -0.10669409, -0.05650572,
0.06644096, 0.24769837, -0.11404654, -0.49358358, -0.27725445,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), V71 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, -0.1563703, -0.23797044, -0.37304736, -0.27425744,
-0.02347071, 0.36391633, 0.44316418, 0.21940339, 0.02321926,
-0.01531807, -0.05197635, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), V73 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -0.46298985,
-0.7644245, -0.82771396, -0.81243484, -0.75591058, -0.55440085,
-0.35516327, -0.05602486, -0.12290976, -0.14458255, -0.17033091
), V77 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, -0.04571093, 0.25592819, 0.35649173, 0.3507695, 0.30446594,
0.36505183, 0.54215354, 0.47808018, 0.40325075, 0.32091592, 0.09212919
)), .Names = c("V1", "V48", "V49", "V58", "V59", "V61", "V68",
"V71", "V73", "V77"), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L,
70L, 152L, 153L, 154L, 155L, 156L, 157L, 158L, 159L, 160L, 161L,
162L, 244L, 245L, 246L, 247L, 248L, 249L, 250L, 251L, 252L, 253L,
254L), class = "data.frame")

How to shorten the length of dput

In my last question, it was pointed out that less data would be easier to read and understand as part of the reproducible example. While preparing to ask again, I tried to shorten the data via dput(head(data)), but I get the same output as with dput(data), dput(data[1:6, ]) or even dput(data)[1:6, ] (in this last case I also get the first 6 rows of the data after the whole dput).
Is there a simple way to do it? I didn't find anything in the dput options, and there must be a way to avoid deleting by hand what I do not want to show.
Here is the whole dput data:
>dput(data)
structure(list(GOterm = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L,
21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L,
34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L,
47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L,
60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 71L, 72L, 76L, 77L, 78L,
83L, 87L, 88L, 89L, 93L, 96L, 97L, 101L, 103L, 104L, 105L, 106L,
109L, 111L, 113L, 114L, 116L), .Label = c("GO:0000746", "GO:0000910",
"GO:0006091", "GO:0006259", "GO:0006351", "GO:0006399", "GO:0006412",
"GO:0006457", "GO:0006464", "GO:0006468", "GO:0006486", "GO:0006520",
"GO:0006725", "GO:0006766", "GO:0006810", "GO:0006811", "GO:0006839",
"GO:0006897", "GO:0006950", "GO:0006970", "GO:0006974", "GO:0006979",
"GO:0006986", "GO:0006997", "GO:0007005", "GO:0007010", "GO:0007029",
"GO:0007031", "GO:0007033", "GO:0007034", "GO:0007049", "GO:0007059",
"GO:0007114", "GO:0007124", "GO:0007126", "GO:0007165", "GO:0009408",
"GO:0009409", "GO:0015031", "GO:0016044", "GO:0016050", "GO:0016070",
"GO:0016071", "GO:0016072", "GO:0016192", "GO:0016567", "GO:0016568",
"GO:0016570", "GO:0019725", "GO:0030435", "GO:0031505", "GO:0032196",
"GO:0032989", "GO:0042221", "GO:0042254", "GO:0042594", "GO:0043543",
"GO:0044255", "GO:0044257", "GO:0044262", "GO:0045333", "GO:0046483",
"GO:0048193", "GO:0051169", "GO:0051186", "GO:0051276", "GO:0070271",
"GO:0000278", "GO:0000902", "GO:0002181", "GO:0005975", "GO:0006325",
"GO:0006353", "GO:0006360", "GO:0006366", "GO:0006383", "GO:0006397",
"GO:0006401", "GO:0006414", "GO:0006418", "GO:0006470", "GO:0006605",
"GO:0006629", "GO:0006865", "GO:0006869", "GO:0006873", "GO:0006887",
"GO:0006914", "GO:0008033", "GO:0008213", "GO:0008643", "GO:0009311",
"GO:0009451", "GO:0015931", "GO:0016197", "GO:0023052", "GO:0031399",
"GO:0032543", "GO:0042255", "GO:0042273", "GO:0042274", "GO:0043144",
"GO:0043934", "GO:0045454", "GO:0051052", "GO:0051321", "GO:0051603",
"GO:0051604", "GO:0051726", "GO:0055086", "GO:0070647", "GO:0000054",
"GO:0001403", "GO:0006352", "GO:0006354", "GO:0006364", "GO:0006413",
"GO:0006417", "GO:0006497", "GO:0008380", "GO:0009072", "GO:0051049",
"GO:0061025", "GO:0071554"), class = "factor"), GOdesc = structure(c(16L,
17L, 23L, 19L, 58L, 62L, 59L, 37L, 39L, 40L, 38L, 3L, 4L, 67L,
60L, 27L, 30L, 20L, 51L, 48L, 46L, 49L, 52L, 33L, 29L, 18L, 21L,
34L, 64L, 63L, 2L, 14L, 1L, 43L, 28L, 56L, 47L, 45L, 41L, 9L,
65L, 54L, 31L, 55L, 66L, 42L, 12L, 26L, 7L, 57L, 22L, 61L, 6L,
44L, 53L, 50L, 35L, 8L, 10L, 5L, 11L, 25L, 24L, 32L, 15L, 13L,
36L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), .Label = c("cell budding", "cell cycle",
"cellular amino acid and metabolic process", "cellular aromatic compound metabolic process",
"cellular carbohydrate metabolic process", "cellular component morphogenesis",
"cellular homeostasis", "cellular lipid metabolic process", "cellular membrane organization",
"cellular protein catabolic process", "cellular respiration",
"chromatin modification", "chromosome organization and biogenesis",
"chromosome segregation", "cofactor metabolic process", "conjugation",
"cytokinesis", "cytoskeleton organization and biogenesis", "DNA metabolic process",
"endocytosis", "ER organization and biogenesis", "fungal-type cell wall organization",
"generation of precursor metabolites and energy", "golgi vesicle transport",
"heterocycle metabolic process", "histone modification", "ion transport",
"meiosis", "mitchondrion organization", "mitochondrial transport",
"mRNA metabolic process", "nuclear transport", "nucleus organization",
"peroxisome organization", "protein acylation", "protein complex biogenesis",
"protein folding", "protein glycosylation", "protein modification process",
"protein phosphorylation", "protein transport", "protein ubiquitination",
"pseudohyphal growth", "response to chemical stimulus", "response to cold",
"response to DNA damage stimulus", "response to heat", "response to osmotic stress",
"response to oxidative stress", "response to starvation", "response to stress",
"response to unfolded protein", "ribosome biogenesis", "RNA metabolic process",
"rRNA metabolic process", "signal transduction", "sporulation resulting in formation of a cellular spore",
"transcription", "translation", "transport", "transposition",
"tRNA metabolic process", "vacuolar transport", "vacuole organizations",
"vesicle organization", "vesicle-mediated transport", "vitamin metabolic process"
), class = "factor"), GSA_p33_SC = c(NA, -1, NA, NA, NA, NA,
NA, 1, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, NA, NA, NA, -1, NA, NA,
-1, -1, NA, NA, NA, NA, NA, -1, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), GSA_p33_X33 = c(NA, NA, -1, NA, NA, NA, NA, NA,
NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 1, NA, NA, NA, NA, NA, NA, 1, 1, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1,
NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA,
NA), GSA_p38_SC = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1, NA, NA, NA, -1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, NA, NA, NA,
NA, NA, NA, -1, NA, NA, NA, -1, NA, NA, NA, NA, NA, NA), GSA_p38_X33 = c(NA,
1, NA, NA, NA, NA, NA, 1, NA, NA, 1, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, 1,
1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, -1, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, -1, NA, NA, 1, NA, NA), GSA_p52_SC = c(NA, NA, NA, NA,
NA, NA, NA, 1, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA,
-1, -1, NA, NA, NA), GSA_p52_X33 = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1,
NA, -1, NA, 1, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, -1, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, -1, NA, NA, NA, NA, NA, NA, NA, NA, -1, NA, NA, NA, -1, NA,
NA, NA, NA), GSA_p64_SC = c(NA, NA, NA, NA, NA, NA, NA, 1, NA,
NA, 1, NA, NA, -1, NA, NA, NA, NA, NA, NA, NA, -1, NA, NA, NA,
1, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, -1, NA, NA, NA, NA, NA, -1, NA, -1, -1,
NA, NA, NA, -1, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, -1, 1,
-1, NA, NA, NA, NA, NA, NA, NA, -1, NA, NA, NA, NA, NA, NA, NA
), GSA_p64_X33 = c(1, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1,
NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, NA, NA, NA, 1, NA, NA,
NA, NA, NA, NA, -1, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, NA, NA, NA,
NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, -1, -1), GSA_SC_X33 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, NA,
NA, NA, NA, NA, NA, NA, -1, NA, 1, NA, NA, NA, NA, NA, NA, 1,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA,
NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA,
1, NA, NA, 1, -1, NA, -1, NA, NA, NA, -1, 1, NA, NA, NA, NA,
NA, -1, NA, NA, NA, NA, NA, NA)), .Names = c("GOterm", "GOdesc",
"GSA_p33_SC", "GSA_p33_X33", "GSA_p38_SC", "GSA_p38_X33", "GSA_p52_SC",
"GSA_p52_X33", "GSA_p64_SC", "GSA_p64_X33", "GSA_SC_X33"), row.names = c(NA,
-89L), class = "data.frame")
A shortened version could look like this:
structure(list(GOterm = structure(c(1L, 2L, 3L, 4L, 5L, 6L),
.Label = c("GO:0000746", "GO:0000910", "GO:0006091", "GO:0006259",
"GO:0006351", "GO:0006399"), class = "factor"),
GOdesc = structure(c(16L,17L, 23L, 19L, 58L, 62L),
.Label = c("cell budding", "cell cycle",
"cellular amino acid and metabolic process", "cellular aromatic compound
metabolic process", "cellular carbohydrate metabolic process", "cellular
component morphogenesis"), class = "factor"),
GSA_p33_SC = c(NA, -1, NA, NA, NA, NA),
GSA_p33_X33 = c(NA, NA, -1, NA, NA, NA),
GSA_p38_SC = c(NA, NA, NA, NA, NA, NA),
GSA_p38_X33 = c(NA, 1, NA, NA, NA, NA),
GSA_p52_SC = c(NA, NA, NA, NA, NA, NA),
GSA_p52_X33 = c(NA, NA, NA, NA, NA, NA),
GSA_p64_SC = c(NA, NA, NA, NA, NA, NA),
GSA_p64_X33 = c(1, NA, NA, NA, NA, NA),
GSA_SC_X33 = c(NA, NA, NA, NA, NA, NA)),
.Names = c("GOterm", "GOdesc",
"GSA_p33_SC", "GSA_p33_X33", "GSA_p38_SC", "GSA_p38_X33", "GSA_p52_SC",
"GSA_p52_X33", "GSA_p64_SC", "GSA_p64_X33", "GSA_SC_X33"), row.names = c(NA,
-6L), class = "data.frame"))
All of that extra funk is from your factor levels. If you know your problem will still be reproducible after dropping these levels, then you can consider (wait for it) droplevels:
> dput(droplevels(head(data)))
structure(list(GOterm = structure(1:6, .Label = c("GO:0000746",
"GO:0000910", "GO:0006091", "GO:0006259", "GO:0006351", "GO:0006399"
), class = "factor"), GOdesc = structure(c(1L, 2L, 4L, 3L, 5L,
6L), .Label = c("conjugation", "cytokinesis", "DNA metabolic process",
"generation of precursor metabolites and energy", "transcription",
"tRNA metabolic process"), class = "factor"), GSA_p33_SC = c(NA,
-1, NA, NA, NA, NA), GSA_p33_X33 = c(NA, NA, -1, NA, NA, NA),
GSA_p38_SC = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), GSA_p38_X33 = c(NA, 1, NA, NA, NA, NA), GSA_p52_SC = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), GSA_p52_X33 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), GSA_p64_SC = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), GSA_p64_X33 = c(1,
NA, NA, NA, NA, NA), GSA_SC_X33 = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_)), .Names = c("GOterm", "GOdesc",
"GSA_p33_SC", "GSA_p33_X33", "GSA_p38_SC", "GSA_p38_X33", "GSA_p52_SC",
"GSA_p52_X33", "GSA_p64_SC", "GSA_p64_X33", "GSA_SC_X33"), row.names = c(NA,
6L), class = "data.frame")
This is more easily demonstrated in the following example:
x <- factor("A", levels = LETTERS)
x
# [1] A
# Levels: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
dput(x)
# structure(1L, .Label = c("A", "B", "C", "D", "E", "F", "G", "H",
# "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U",
# "V", "W", "X", "Y", "Z"), class = "factor")
dput(droplevels(x))
# structure(1L, .Label = "A", class = "factor")
Another way to shorten the output is to convert the columns to character before calling dput. The data can then be read back in with as.data.frame, and the factor levels are preserved.
First subset
> data2 <- data[sample(nrow(data), 4), ]
Then dput as characters
> d <- dput(lapply(data2, as.character))
structure(list(GOterm = c("GO:0000746", "GO:0070647", "GO:0006914",
"GO:0007010"), GOdesc = c("conjugation", NA, NA, "cytoskeleton organization and biogenesis"
), GSA_p33_SC = c(NA_character_, NA_character_, NA_character_,
NA_character_), GSA_p33_X33 = c(NA, NA, "1", "1"), GSA_p38_SC = c(NA_character_,
NA_character_, NA_character_, NA_character_), GSA_p38_X33 = c(NA_character_,
NA_character_, NA_character_, NA_character_), GSA_p52_SC = c(NA,
"-1", NA, NA), GSA_p52_X33 = c(NA, NA, NA, "1"), GSA_p64_SC = c(NA,
NA, NA, "1"), GSA_p64_X33 = c("1", NA, NA, NA), GSA_SC_X33 = c(NA,
NA, NA, "1")), .Names = c("GOterm", "GOdesc", "GSA_p33_SC", "GSA_p33_X33",
"GSA_p38_SC", "GSA_p38_X33", "GSA_p52_SC", "GSA_p52_X33", "GSA_p64_SC",
"GSA_p64_X33", "GSA_SC_X33"))
And read back in
> as.data.frame(d)

Resources