R ggplot2 grid labeling
I've been asked to make a bar plot from pollution data. Example data can be found here. The data structure is as follows:
str(datos)
'data.frame': 55 obs. of 10 variables:
 $ PROVINCIA           : int 46 46 46 46 46 46 46 46 46 46 ...
 $ ESTACION            : Factor w/ 55 levels "Alacant-El_Pla",..: 5 1 2 3 8 23 24 21 31 22 ...
 $ MAXIMO_HORARIO      : num 99.5 88.5 88.5 90 97.5 87.3 96 92.5 88 20 ...
 $ PROMEDIO_DIARIO     : num NA NA NA NA NA NA NA NA NA NA ...
 $ MAXIMO_OCTOHORARIO  : num 103.9 83.1 80.9 75.7 95.1 ...
 $ VARIACION_MAX_HOR   : num -25.2 -6.5 -6.7 -1.2 -13.2 -15.4 -12.7 -29.5 -16.3 NA ...
 $ VARIACION_PRM_DIA   : num NA NA NA NA NA NA NA NA NA NA ...
 $ OSCILACION_DIARIO   : num 16.5 63.7 53.3 62 26.8 31.3 29.2 15 52 20 ...
 $ ESTACIONALIDAD_MAX  : num -38.2 -39.6 -36.8 -38.8 -37.6 -51.8 -35.6 -40.3 -42.9 -86.5 ...
 $ ESTACIONALIDAD_MAX-1: num NA NA NA NA NA NA NA NA NA NA ...
I've tried to use ggplot2's geom_bar geometry and faceting with the following code:
datos=read.csv("data.csv",header=T,sep=",", na.strings="-99.9")
ggplot(datos, aes(ESTACION,MAXIMO_HORARIO, fill = factor(MAXIMO_HORARIO))) +
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle=90, size=10)) +
facet_grid(PROVINCIA ~ .)
obtaining this output
This is on the right track, but I would like every facet (group) to show only its own values, without the empty space that corresponds to data in another facet, and with the right labels in each panel. I could split the data into three parts and produce three different plots, but I'd like to build a single file with the three plots in it.
The desired output would look like this:
EDIT: Output of dput(datos)
> dput(datos)
structure(list(PROVINCIA = c(46L, 46L, 46L, 46L, 46L, 46L, 46L,
46L, 46L, 46L, 46L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), ESTACION = structure(c(5L, 1L, 2L, 3L,
8L, 23L, 24L, 21L, 31L, 22L, 41L, 27L, 12L, 13L, 14L, 15L, 16L,
18L, 28L, 29L, 19L, 37L, 39L, 26L, 49L, 52L, 53L, 54L, 55L, 4L,
7L, 6L, 9L, 10L, 11L, 17L, 20L, 33L, 25L, 30L, 32L, 36L, 35L,
34L, 38L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 50L, 51L, 40L), .Label = c("Alacant-El_Pla",
"Alacant-Florida_Babel", "Alacant-Rabassa", "Albalat_dels_Tarongers",
"Alcoi-Verge_dels_Lliris", "Algar_de_Pal", "Alzira", "Benidorm",
"Benig", "Bull", "Burjassot-Facultats", "Burriana", "Castell1",
"Castell2", "Castell3", "Castell4", "Caudete_de_las_Fuentes",
"Cirat", "Coratxar", "Cortes_de_Pall", "Elda-Lacy", "El_Pin",
"Elx-Agroalimentari", "Elx-Parc_de_Bombers", "Gandia", "La_Vall_d",
"Lluce", "Morella", "Onda", "Ontinyent", "Orihuela", "Paterna-CEAM",
"Quart_de_Poblet", "Sagunt-CEA", "Sagunt-Nord", "Sagunt-Port",
"Sant_Jordi", "Torrebaja", "Torre_Endom", "Torrent-El_Vedat",
"Torrevieja", "Val1", "Val2", "Val3", "Val4", "Val5", "Val6",
"Val7", "Vilafranca", "Vilamarxant", "Villar_del_Arzobispo",
"Vinaros", "VinarosP", "Viver", "Zorita"), class = "factor"),
MAXIMO_HORARIO = c(99.5, 88.5, 88.5, 90, 97.5, 87.3, 96,
92.5, 88, 20, 20, 81.5, 99, 91.7, 93.5, 81.5, 90.5, 84.5,
100.3, 96.3, 41.7, 91.5, 57.3, NA, 93, 111.5, 86.8, NA, 100.3,
21.9, 80.5, 111, 98.7, 87.3, 89.7, 87.5, 41.7, 81.7, NA,
20, 84.8, 92, 88.7, NA, 74, NA, 95, 20.5, 85.7, 80, 82.3,
76, 20, 90.8, NA), PROMEDIO_DIARIO = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 21.9, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), MAXIMO_OCTOHORARIO = c(103.9, 83.1,
80.9, 75.7, 95.1, 82.9, 90.2, 83.5, 85, NA, NA, 77.1, 76.7,
91.4, 73.1, 65.1, 96.6, 81.1, 110.5, 91.1, NA, 87.8, 54.8,
NA, 95.1, 116.8, 79.9, NA, 107.2, 73.9, 70.5, 102.8, 100.5,
77.5, 80.9, 86.9, NA, 70.5, NA, NA, 73.5, 86.9, 86, NA, 83.5,
NA, 84.5, 20.5, 90.8, 71.5, 67.5, 64.5, NA, 91.4, NA), VARIACION_MAX_HOR = c(-25.2,
-6.5, -6.7, -1.2, -13.2, -15.4, -12.7, -29.5, -16.3, NA,
NA, -32.5, -11.5, -22.3, -19.5, -22.3, -25.3, -24.7, -14.7,
-18, NA, -12.8, -36, NA, -27.3, -11.4, -15.7, NA, -21.4,
-103.6, -26, -24.5, -33.1, -30, -31, -17.8, NA, -15.1, NA,
NA, -23.5, -32.5, -16.1, NA, -32.3, NA, -28.2, 0.3, -30.5,
-17.3, -18.4, -19.7, NA, -31.2, NA), VARIACION_PRM_DIA = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), OSCILACION_DIARIO = c(16.5,
63.7, 53.3, 62, 26.8, 31.3, 29.2, 15, 52, 20, 20, 51.8, 85.7,
27.5, 80, 74.8, 45, 48.3, 12.5, 21.6, 41.7, 41.8, 35.3, NA,
26.5, 27.1, 64.2, NA, 58.6, 3.9, 39.2, 39.3, 32.9, 22.6,
43.4, 17.3, 41.7, 46.9, NA, 20, 50.8, 58.2, 64.5, NA, 2.7,
NA, 40.2, 1.5, 25.9, 30.5, 58.6, 31, 20, 15.8, NA), ESTACIONALIDAD_MAX = c(-38.2,
-39.6, -36.8, -38.8, -37.6, -51.8, -35.6, -40.3, -42.9, -86.5,
-83.6, -50.6, -35, -46.8, -45, -57.1, -31.4, -49.7, -35.5,
-45.7, -75.2, -44.1, -62.6, NA, -48.4, -10.8, -39.3, NA,
-38.1, -86.4, -53.7, -16.5, -42.3, -42.2, -38.1, -48.7, -68.2,
-45.4, NA, -87.6, -43.8, -44.2, -43.1, NA, -55.5, NA, -33.1,
-86.1, -38.3, -44.4, -41.6, -38.2, -85.5, -50.1, NA), ESTACIONALIDAD_MAX.1 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -71.11,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("PROVINCIA",
"ESTACION", "MAXIMO_HORARIO", "PROMEDIO_DIARIO", "MAXIMO_OCTOHORARIO",
"VARIACION_MAX_HOR", "VARIACION_PRM_DIA", "OSCILACION_DIARIO",
"ESTACIONALIDAD_MAX", "ESTACIONALIDAD_MAX.1"), class = "data.frame", row.names = c(NA,
-55L))
Sounds like you want facet_wrap rather than facet_grid. Try
ggplot(datos, aes(ESTACION,MAXIMO_HORARIO, fill = factor(MAXIMO_HORARIO))) +
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle=90, size=10)) +
facet_wrap(~PROVINCIA , scales="free", ncol=1)
to get
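If the facet strips should also show readable province names instead of the raw codes (the question mentions wanting "the right labels"), ggplot2's labeller can rename them. This is a sketch only: the mapping of 3, 12 and 46 to province names below is an assumption, not something stated in the question.
prov_names <- c("3" = "Alacant", "12" = "Castello", "46" = "Valencia")  # assumed mapping

ggplot(datos, aes(ESTACION, MAXIMO_HORARIO, fill = factor(MAXIMO_HORARIO))) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, size = 10)) +
  facet_wrap(~ PROVINCIA, scales = "free", ncol = 1,
             labeller = labeller(PROVINCIA = prov_names))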
facet_grid() is not designed for what you want. Making the three plots separately is the right approach, and with the gridExtra package it is easy to combine these plot elements (gridExtra calls them "grobs") into a single plot or a single file.
require(ggplot2)
require(gridExtra)
#toy data
dat <- data.frame(x=1:20, y=sample(1:20, size=20, replace=T), group=sample(1:3, size=20, replace=T))
#making each "grob"
p1 <- ggplot(subset(dat, group==1), aes(factor(x), y)) +
geom_bar(stat='identity')
p2 <- ggplot(subset(dat, group==2), aes(factor(x), y)) +
geom_bar(stat='identity')
p3 <- ggplot(subset(dat, group==3), aes(factor(x), y)) +
geom_bar(stat='identity')
#combine them into a single stack of plots
pAll <- grid.arrange(p1, p2, p3, ncol=1)
pAll
Note that for this approach to work, the x-variable in the parent data.frame has to be a string or a numeric, not a factor. (For numerics, you have to convert it to a factor after subsetting; that's the only way ggplot2 knows that you don't want to show the gaps where each subset has no data. For strings this isn't a problem, and the x-axis never needs to be a factor.)
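Applied to the question's data, where ESTACION is already a factor, one workaround along the same lines (a sketch, not a tested answer) is to drop the unused factor levels after subsetting, so each grob only keeps its own stations:
library(ggplot2)
library(gridExtra)

# one grob per province; droplevels() removes the stations that belong to the
# other provinces, so no empty bars appear in the panel
plot_prov <- function(prov) {
  d <- droplevels(subset(datos, PROVINCIA == prov))
  ggplot(d, aes(ESTACION, MAXIMO_HORARIO)) +
    geom_bar(stat = "identity") +
    ggtitle(paste("Provincia", prov)) +
    theme(axis.text.x = element_text(angle = 90, size = 10))
}

grid.arrange(plot_prov(3), plot_prov(12), plot_prov(46), ncol = 1)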
Related
Formattable - Export to PDF
I'd like to accompany my ggplot2-visualisations with nice looking tables. Or in some cases just display the tables. My target audience is not fond of just being presented a table. It needs some clear indicators of 'where to look' so to say. For that I've been using formattable (see pct_change... columns). I can export the table below in an image format, but I've been unable to fully reproduce it as a pdf. When I export it as a html, then print from the browser, I lose the colour formatting (see pct_formatter-code at bottom). I've tried Edge, Firefox and Chrome. Turning on print with colour does not help. So in addition to being cumbersome (the table below is one of a group of 150) to print via the browser, it also doens't give me the desired result. I've also found a workaround here on Stackoverflow where someone wrote an 'export_formattable' function. This does indeed export in pdf directly from R. However I lose again the colour and when I open it in Adobe Illustrator, I also lose the arrow icons, they become like [X]-boxes. So that doesn't work either. I haven't really tried Rmarkdown to be honest, simply because I'm quite unskilled in using it. From what I tried, it's seems it's not made to simply output a table in the way, shape or size I want. I don't want create a (reproducible) rapport. I just need a 'nicely' formatted pdf-table (or .svg!!) that I will then manually combine with a visualisation in InDesign to make the desired document. Thanks for reading, hope there's some way to help! pct_formatter <- formatter("span", style = x ~ style( color = ifelse( x > 0, "#39870c", ifelse( x < 0, "#d52b1e", "black") ) ), x ~ icontext(ifelse(x>0, "arrow-up", "arrow-down"), x) ) Data: structure(list(Year = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("2019", "2020", "2021"), class = "factor"), Month = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9), Totaal_permaand = c(2243L, 2007L, 2884L, 2206L, 2701L, 2325L, 1452L, 1721L, 3152L, 3067L, 3097L, 2554L, 3303L, 2948L, 3325L, 3173L, 3504L, 3209L, 5924L, 4637L, 5735L, 6206L, 4252L, 3479L, 4312L, 3128L, 4529L, 4170L, 3814L, 5587L, 9281L, 4615L, 4426L), abs_change.M = c(NA, -236L, 877L, -678L, 495L, -376L, -873L, 269L, 1431L, -85L, 30L, -543L, 749L, -355L, 377L, -152L, 331L, -295L, 2715L, -1287L, 1098L, 471L, -1954L, -773L, 833L, -1184L, 1401L, -359L, -356L, 1773L, 3694L, -4666L, -189L ), pct_change.M = c(NA, -10.5, 43.7, -23.5, 22.4, -13.9, -37.5, 18.5, 83.1, -2.7, 1, -17.5, 29.3, -10.7, 12.8, -4.6, 10.4, -8.4, 84.6, -21.7, 23.7, 8.2, -31.5, -18.2, 23.9, -27.5, 44.8, -7.9, -8.5, 46.5, 66.1, -50.3, -4.1), abs_change.Y = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1060L, 941L, 441L, 967L, 803L, 884L, 4472L, 2916L, 2583L, 3139L, 1155L, 925L, 1009L, 180L, 1204L, 997L, 310L, 2378L, 3357L, -22L, -1309L), pct_change.Y = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 47.3, 46.9, 15.3, 43.8, 29.7, 38, 308, 169.4, 81.9, 102.3, 37.3, 36.2, 30.5, 6.1, 36.2, 31.4, 8.8, 74.1, 56.7, -0.5, -22.8), abs_change.Y2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2069L, 1121L, 1645L, 1964L, 1113L, 3262L, 7829L, 2894L, 1274L), pct_change.Y2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 92.2, 55.9, 57, 89, 41.2, 140.3, 539.2, 168.2, 40.4), CS = c(2243L, 4250L, 7134L, 9340L, 12041L, 14366L, 
15818L, 17539L, 20691L, 23758L, 26855L, 29409L, 3303L, 6251L, 9576L, 12749L, 16253L, 19462L, 25386L, 30023L, 35758L, 41964L, 46216L, 49695L, 4312L, 7440L, 11969L, 16139L, 19953L, 25540L, 34821L, 39436L, 43862L), abs_change.SY = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1060L, 2001L, 2442L, 3409L, 4212L, 5096L, 9568L, 12484L, 15067L, 18206L, 19361L, 20286L, 1009L, 1189L, 2393L, 3390L, 3700L, 6078L, 9435L, 9413L, 8104L), pct_change.SY = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 47.3, 47.1, 34.2, 36.5, 35, 35.5, 60.5, 71.2, 72.8, 76.6, 72.1, 69, 30.5, 19, 25, 26.6, 22.8, 31.2, 37.2, 31.4, 22.7), abs_change.SY2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2069L, 3190L, 4835L, 6799L, 7912L, 11174L, 19003L, 21897L, 23171L), pct_change.SY2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 92.2, 75.1, 67.8, 72.8, 65.7, 77.8, 120.1, 124.8, 112)), row.names = c(NA, -33L), class = c("tbl_df", "tbl", "data.frame" ))
Have you tried to open your exported PDF in Inkscape? I have edited several PDF files in Inkscape and not lost anything from the original PDF.
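If the goal is still a direct PDF export, another route that might be worth a try (a sketch only, not verified against this table; my_table is a placeholder, and it is an assumption that chrome_print()'s options list is forwarded to Chrome's printToPDF, where printBackground controls the background colours that browser print dialogs usually drop):
library(formattable)
library(htmlwidgets)

# save the table as a self-contained HTML widget ('my_table' stands in for the
# formattable object built with pct_formatter above)
w <- as.htmlwidget(my_table)
saveWidget(w, "table.html", selfcontained = TRUE)

# print the HTML with headless Chrome, asking it to keep background colours
pagedown::chrome_print("table.html", output = "table.pdf",
                       options = list(printBackground = TRUE))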
R, extrapolate average scores from graph
I have a graph like this: With data that created it like this: test<-structure(list(study_id = c(1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 13, 13, 13, 13, 13, 34, 34, 34, 34, 34, 40, 40, 40, 40, 40, 44, 44, 44, 44, 44, 47, 47, 47, 47, 47, 49, 49, 49, 49, 49, 51, 51, 51, 51, 51, 61, 61, 61, 61, 61, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 68, 68, 68, 68, 68, 72, 72, 72, 72, 72, 75, 75, 75, 75, 75, 80, 80, 80, 80, 80, 84, 84, 84, 84, 84, 86, 86, 86, 86, 86, 94, 94, 94, 94, 94, 95, 95, 95, 95, 95, 101, 101, 101, 101, 101, 105, 105, 105, 105, 105, 111, 111, 111, 111, 111, 117, 117, 117, 117, 117, 123, 123, 123, 123, 123, 124, 124, 124, 124, 124, 125, 125, 125, 125, 125, 126, 126, 126, 126, 126, 131, 131, 131, 131, 131, 145, 145, 145, 145, 145, 153, 153, 153, 153, 153, 154, 154, 154, 154, 154, 155, 155, 155, 155, 155, 156, 156, 156, 156, 156, 161, 161, 161, 161, 161, 162, 162, 162, 162, 162, 166, 166, 166, 166, 166, 167, 167, 167, 167, 167, 169, 169, 169, 169, 169, 172, 172, 172, 172, 172, 175, 175, 175, 175, 175, 179, 179, 179, 179, 179, 180, 180, 180, 180, 180, 184, 184, 184, 184, 184, 185, 185, 185, 185, 185, 188, 188, 188, 188, 188, 190, 190, 190, 190, 190, 192, 192, 192, 192, 192, 194, 194, 194, 194, 194, 195, 195, 195, 195, 195, 197, 197, 197, 197, 197, 199, 199, 199, 199, 199, 203, 203, 203, 203, 203, 207, 207, 207, 207, 207, 210, 210, 210, 210, 210, 211, 211, 211, 211, 211, 212, 212, 212, 212, 212, 217, 217, 217, 217, 217, 221, 221, 221, 221, 221, 223, 223, 223, 223, 223, 227, 227, 227, 227, 227, 228, 228, 228, 228, 228, 229, 229, 229, 229, 229, 239, 239, 239, 239, 239, 244, 244, 244, 244, 244, 253, 253, 253, 253, 253, 256, 256, 256, 256, 256, 257, 257, 257, 257, 257, 259, 259, 259, 259, 259, 266, 266, 266, 266, 266, 272, 272, 272, 272, 272, 275, 275, 275, 275, 275, 277, 277, 277, 277, 277, 278, 278, 278, 278, 278, 284, 284, 284, 284, 284, 288, 288, 288, 288, 288, 290, 290, 290, 290, 290, 291, 291, 291, 291, 291, 292, 292, 292, 292, 292, 294, 294, 294, 294, 294, 295, 295, 295, 295, 295, 296, 296, 296, 296, 296, 299, 299, 299, 299, 299, 300, 300, 300, 300, 300, 301, 301, 301, 301, 301, 303, 303, 303, 303, 303, 305, 305, 305, 305, 305, 306, 306, 306, 306, 306, 307, 307, 307, 307, 307, 309, 309, 309, 309, 309, 313, 313, 313, 313, 313, 315, 315, 315, 315, 315, 316, 316, 316, 316, 316, 320, 320, 320, 320, 320, 324, 324, 324, 324, 324, 331, 331, 331, 331, 331, 336, 336, 336, 336, 336, 337, 337, 337, 337, 337, 348, 348, 348, 348, 348, 349, 349, 349, 349, 349, 352, 352, 352, 352, 352, 353, 353, 353, 353, 353, 367, 367, 367, 367, 367, 373, 373, 373, 373, 373, 382, 382, 382, 382, 382, 387, 387, 387, 387, 387, 388, 388, 388, 388, 388, 389, 389, 389, 389, 389, 392, 392, 392, 392, 392, 398, 398, 398, 398, 398, 401, 401, 401, 401, 401, 402, 402, 402, 402, 402, 404, 404, 404, 404, 404, 405, 405, 405, 405, 405, 410, 410, 410, 410, 410, 411, 411, 411, 411, 411, 412, 412, 412, 412, 412, 413, 413, 413, 413, 413, 414, 414, 414, 414, 414, 415, 415, 415, 415, 415, 420, 420, 420, 420, 420, 428, 428, 428, 428, 428, 431, 431, 431, 431, 431, 433, 433, 433, 433, 433, 434, 434, 434, 434, 434, 436, 436, 436, 436, 436), Time = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 
5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), Score = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 3, NA, NA, NA, NA, 0, 0, NA, NA, NA, NA, NA, NA, NA, NA, 4, 7, NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA, NA, NA, 0, NA, NA, NA, NA, 0, NA, NA, NA, NA, 0, 0, 7, 8, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 5, 7, NA, NA, NA, 0, NA, NA, NA, NA, 0, 5, 8, NA, NA, 7, 8, NA, NA, NA, 0, 0, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA, NA, 4, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2, 8, 8, NA, NA, 3, NA, NA, NA, NA, 1, NA, NA, NA, NA, 0, 9, NA, NA, NA, 2, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 0, 1, 5, 5, NA, NA, NA, NA, NA, 3, 4, 4, NA, NA, 0, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA, NA, 0, 0, 0, 1, 1, 9, 9, NA, NA, NA, NA, NA, NA, NA, NA, 0, 2, 5, 5, NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, NA, NA, NA, NA, NA, NA, NA, NA, NA, 6, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA, NA, 0, NA, NA, NA, NA, 7, NA, NA, NA, NA, 5, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 4, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 1, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 8, 8, NA, NA, NA, 0, NA, NA, NA, NA, 0, NA, NA, NA, NA, 0, 3, NA, NA, NA, 6, NA, NA, NA, NA, NA, NA, NA, NA, NA, 5, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7, NA, NA, NA, NA, 0, 0, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 3, 8, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA, NA, 5, 5, 5, NA, NA, 0, NA, NA, NA, NA, 2, 7, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 0, NA, NA, NA, NA, NA, NA, NA, NA, 0, 3, NA, NA, NA, 0, NA, NA, NA, NA, 7, 7, 8, NA, NA, 0, NA, 0, NA, NA, 2, 4, 4, NA, NA), TimeBetweenScans = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 316, NA, NA, NA, NA, 113, 139, NA, NA, NA, NA, NA, NA, NA, NA, 335, 660, NA, NA, NA, NA, NA, NA, NA, NA, 104, NA, NA, NA, NA, 7, NA, NA, NA, NA, 42, NA, NA, NA, NA, 30, 84, 467, 826, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 643, 1794, NA, NA, NA, 404, NA, NA, NA, NA, 40, 221, 394, NA, NA, 171, 320, NA, NA, NA, 51, 227, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 449, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 56, NA, NA, NA, NA, 104, NA, NA, NA, NA, NA, NA, NA, NA, NA, 79, 989, 1097, NA, NA, 116, NA, NA, NA, NA, 65, NA, NA, NA, NA, 39, 411, NA, NA, NA, 1193, NA, NA, NA, NA, NA, NA, NA, NA, NA, 142, NA, NA, NA, NA, NA, NA, NA, NA, NA, 106, 216, 266, 497, 575, NA, NA, NA, NA, NA, 221, 474, 796, NA, NA, 18, NA, NA, NA, NA, 87, 1565, NA, NA, NA, NA, NA, NA, NA, NA, 36, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 207, 529, NA, NA, NA, NA, NA, NA, NA, NA, 125, NA, NA, NA, NA, 137, 372, 941, 1102, 1225, 927, 1006, NA, NA, NA, NA, NA, NA, NA, NA, 63, 429, 533, 567, NA, NA, NA, NA, NA, NA, 156, 447, 470, 1204, 1266, 32, NA, NA, NA, NA, NA, NA, NA, NA, NA, 411, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 201, NA, NA, NA, NA, 160, NA, NA, NA, NA, 166, NA, NA, NA, NA, 459, NA, NA, NA, NA, NA, NA, NA, NA, NA, 212, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 50, 313, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 312, 530, 783, 1574, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1627, 1706, NA, NA, NA, 354, NA, NA, NA, NA, 33, NA, NA, NA, NA, 62, 130, NA, NA, NA, 1416, NA, NA, NA, NA, 121, NA, NA, NA, NA, 842, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 24, 64, 82, 122, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 250, NA, NA, NA, NA, 174, 300, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 216, 264, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 17, NA, NA, NA, NA, 214, 268, 388, NA, NA, 24, NA, NA, NA, NA, 149, 382, NA, NA, NA, NA, NA, NA, NA, NA, 8, NA, NA, NA, NA, 91, 188, NA, NA, NA, NA, NA, NA, NA, NA, 72, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 9, 38, NA, 
NA, NA, NA, NA, NA, NA, NA, 13, 138, NA, NA, NA, 42, NA, NA, NA, NA, 771, 1200, 1512, NA, NA, 113, 166, 180, NA, NA, 122, 475, 640, NA, NA), Groups = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Two", NA, NA, NA, NA, "Zero", "Zero", NA, NA, NA, NA, NA, NA, NA, NA, "Two", "Two", NA, NA, NA, NA, NA, NA, NA, NA, "Two", NA, NA, NA, NA, "Zero", NA, NA, NA, NA, "Zero", NA, NA, NA, NA, "Two", "Two", "Two", "Two", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Two", "Two", NA, NA, NA, "One", NA, NA, NA, NA, "Two", "Two", "Two", NA, NA, "Two", "Two", NA, NA, NA, "Zero", "Zero", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Two", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Zero", NA, NA, NA, NA, "Two", NA, NA, NA, NA, NA, NA, NA, NA, NA, "Two", "Two", "Two", NA, NA, "Two", NA, NA, NA, NA, "Two", NA, NA, NA, NA, "Two", "Two", NA, NA, NA, "One", NA, NA, NA, NA, NA, NA, NA, NA, NA, "Two", NA, NA, NA, NA, NA, NA, NA, NA, NA, "Two", "Two", "Two", "Two", "Two", NA, NA, NA, NA, NA, "Two", "Two", "Two", NA, NA, "Zero", NA, NA, NA, NA, "Zero", "Zero", NA, NA, NA, NA, NA, NA, NA, NA, "Zero", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Zero", "Zero", NA, NA, NA, NA, NA, NA, NA, NA, "Zero", NA, NA, NA, NA, "One", "One", "One", "One", "One", "Two", "Two", NA, NA, NA, NA, NA, NA, NA, NA, "Two", "Two", "Two", "Two", NA, NA, NA, NA, NA, NA, "One", "One", "One", "One", "One", "Zero", NA, NA, NA, NA, NA, NA, NA, NA, NA, "Two", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Zero", NA, NA, NA, NA, "Zero", NA, NA, NA, NA, "Two", NA, NA, NA, NA, "Two", NA, NA, NA, NA, NA, NA, NA, NA, NA, "Two", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Two", "Two", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "One", "One", "One", "One", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Two", "Two", NA, NA, NA, "One", NA, NA, NA, NA, "Zero", NA, NA, NA, NA, "Two", "Two", NA, NA, NA, "Two", NA, NA, NA, NA, "Zero", NA, NA, NA, NA, "Two", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Two", NA, NA, NA, NA, "Zero", "One", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Two", "Two", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Zero", NA, NA, NA, NA, "Two", "Two", "Two", NA, NA, "Zero", NA, NA, NA, NA, "Two", "Two", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Zero", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Zero", "Zero", NA, NA, NA, NA, NA, NA, NA, NA, "Zero", "Two", NA, NA, NA, "Zero", NA, NA, NA, NA, "Two", "Two", "Two", NA, NA, "One", "One", "One", NA, NA, "Two", "Two", "Two", NA, NA)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA, -630L), spec = structure(list( cols = list(study_id = structure(list(), class = c("collector_double", "collector")), Time = structure(list(), class = c("collector_double", "collector")), Score = structure(list(), class = c("collector_double", "collector")), TimeBetweenScans = structure(list(), class = c("collector_double", "collector")), Groups = structure(list(), class = c("collector_character", "collector"))), default = 
structure(list(), class = c("collector_guess", "collector")), skip = 1L), class = "col_spec"))
And the code that created the graph is this (I grouped the study_ids so that a dotted line connects all the scores from each individual patient, so each line is one person):
test %>%
  ggplot(aes(x = TimeBetweenScans, y = Score, group = study_id, color = Time, shape = Groups)) +
  geom_point(size = 3) +
  geom_line(color = "Black", linetype = "dotted") +
  labs(title = "Oulu Score vs Time", y = "Oulu Score",
       x = "Time from Post-Op Scan to Follow Up Scan", color = "Follow-up Scan")
I was asked to get the "average" score at different timeframes, i.e. the average score at the 1-year follow-up (TimeBetweenScans = 365), at 2 years, 3 years, and 4 years. So, for instance, eyeballing it, you'd take all the dotted lines that cross the red line I drew at the 1-year mark, figure out where they were on the Y axis when they crossed it, and average their scores. If I had rows that contained 365 in the TimeBetweenScans column, I'd write something like:
test %>% filter(TimeBetweenScans == "365") %>% summarise(MeanScore = mean(Score))
That code would select only the data right at the year mark and average the y-axis score for me. But since 365 is never actually present in a row, and only exists where those dotted lines cross it, I need to extrapolate what it WOULD be for that person at 365. Does that make sense? If so, how can I do it?
Here is an idea. I filtered the nearest days around the desired time (year_in_days) for each study_id, calculated a regression line between these points, and predicted the Score at year_in_days. In the last step I calculated the mean over all predictions. You might get a lot of warnings while filtering, because a lot of study_id groups won't have any value, just NA.
Code
# Time you are looking for
year_in_days = 100
test %>%
  group_by(study_id) %>%
  group_modify(~{
    .x %>%
      # filter inside each group the nearest time to year_in_days (lower and upper)
      filter((TimeBetweenScans %in% min(TimeBetweenScans[TimeBetweenScans > year_in_days], na.rm = T)) |
             (TimeBetweenScans %in% max(TimeBetweenScans[TimeBetweenScans < year_in_days], na.rm = T))) %>%
      # keep only groups with two measurements and values for Score
      filter(n() == 2 & !is.na(Score))
  }) %>%
  ungroup() %>%
  group_by(study_id) %>%
  group_modify(~{
    # for each group, predict the value at day year_in_days
    broom::tidy(predict(lm(Score ~ TimeBetweenScans, .x),
                        data.frame(TimeBetweenScans = c(year_in_days))))
  }) %>%
  ungroup() %>%
  # calculate mean score over all predictions
  summarise(mean(x))
Output
# A tibble: 1 x 1
  `mean(x)`
      <dbl>
1      1.14
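A simpler variation on the same idea (a sketch, not part of the original answer; it assumes each patient's score changes roughly linearly between consecutive scans) is base R's approx(), which interpolates directly from the scans that bracket the target day:
library(dplyr)

year_in_days <- 365

test %>%
  filter(!is.na(Score), !is.na(TimeBetweenScans)) %>%
  group_by(study_id) %>%
  # keep only patients whose scans bracket the target day
  filter(n() >= 2,
         min(TimeBetweenScans) <= year_in_days,
         max(TimeBetweenScans) >= year_in_days) %>%
  summarise(Score_at_day = approx(TimeBetweenScans, Score, xout = year_in_days)$y,
            .groups = "drop") %>%
  summarise(MeanScore = mean(Score_at_day))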
R Problems with glm-model due to missing values
I have problems with putting my data into a glm model. I think the problem is because I have many missing values in my data (below). I tried this so far: baseformula = as.formula(df) glm(baseformula, data = df, family = poisson(link = "log"), na.action = na.exclude) I am getting an Error: Error in glm.fit(x = numeric(0), y = integer(0), weights = NULL, start = NULL, : object 'fit' not found Can somebody help me with this? When a variable is NA in my formula, I just want the glm to ignore the NAs and use these variables the same as variables without NA. structure(list(V1 = c(0L, 1L, 3L, 0L, 0L, 0L, 2L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 2L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 1L, 5L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 3L, 0L, 1L, 0L), V48 = c(97.33, 96.88, 85.33, 83.75, 75.58, 86.13, 83, 95.75, 88.46, 80.25, 75, 67.17, 69.33, 64.08, 70.75, 78.46, 85.58, 83.42, 96.17, 76.5, 76.42, 65.38, 69.79, 68.38, 84.67, 89.67, 91.29, 80.54, 64.63, 72.29, 76.54, 65.33, 96.92, 91.38, 88.92, 80.63, 85.5, 76.38, 76.21, 78.29, 89.29, 87.04, 78.67), V49 = c(-0.9, -0.1, 0, 0.9, -0.2, -6.3, -4.9, -1.2, -0.3, -1.4, 7.3, 10.5, 10.8, 17.5, 10.8, 9.2, 7.3, 8.2, 10.2, 8.5, 10.4, 25.6, 26.7, 28, 20.1, 20.2, 15.7, 15.3, 21.6, 24.8, 22.4, 27.1, 14.3, 13.8, 17.1, 19.5, 22.9, 21.9, 17.2, 18.9, 16.3, 14.2, 18.5), V58 = c(0.16208333, -0.02576069, -0.24859501, -0.39733779, -0.35568168, -0.13908246, -0.11529523, -0.07094469, 0.07592036, 0.13803538, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), V59 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.40727943, 0.44007391, 0.50582446, 0.59001139, 0.55057958, 0.53888617, 0.55019019, 0.42592698, 0.347516, 0.52019593, 0.69611622, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), V61 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.04555282, 0.16109391, 0.13651381, -0.02339007, -0.24799358, -0.14477839, -0.0845835, -0.13505766, -0.06910931, 0.05876354, 0.11372484, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), V68 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.01575957, -0.19924471, -0.39083879, -0.26620543, -0.10669409, -0.05650572, 0.06644096, 0.24769837, -0.11404654, -0.49358358, -0.27725445, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), V71 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -0.1563703, -0.23797044, -0.37304736, -0.27425744, -0.02347071, 0.36391633, 0.44316418, 0.21940339, 0.02321926, -0.01531807, -0.05197635, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), V73 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -0.46298985, -0.7644245, -0.82771396, -0.81243484, -0.75591058, -0.55440085, -0.35516327, -0.05602486, -0.12290976, -0.14458255, -0.17033091 ), V77 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -0.04571093, 0.25592819, 0.35649173, 0.3507695, 0.30446594, 0.36505183, 0.54215354, 0.47808018, 0.40325075, 0.32091592, 0.09212919 )), .Names = c("V1", "V48", "V49", "V58", "V59", "V61", "V68", "V71", "V73", "V77"), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 152L, 153L, 154L, 155L, 156L, 157L, 158L, 159L, 160L, 161L, 162L, 244L, 245L, 246L, 247L, 248L, 249L, 250L, 251L, 252L, 253L, 254L), class 
= "data.frame")
MANOVA with variables from different datasets
This question was already asked on stats.stackexchange, but no one answered. Since I'm not sure which forum is the appropriate one, I post this here again with some data. I have done experiments on various characteristics of tree bark and now want to compare in how far the five examined tree species differ in regards to the assessed parameters. So, it was suggested that I should use a MANOVA to analyse my data and it seems reasonable to me. My analysis is conducted in R. However, unlike most examples I've found on how to do a MANOVA (i.e. here, here, here), my data stems from different measurements and from different individuals. Now, I've found only this thread discussing unequal sample sizes, but this targets only sample sizes within the explaining factor. To illustrate a bit further, imagine I have per tree species... 9 measurements of the bark roughness. 4 measurements of the bark thickness, 3 pH measurements, 5 measurements of the water-holding capacity, 5 measurements of the water retention. Of course, I could do separate ANOVAs for each of these variables (and I already did), but I think there should be some advantages in a MANOVA, right? My Question: Would a MANOVA be appropriate for such kind of data? Can I just ignore my different variable sizes? Is there an alternative way to do this or rather an alternative statistic test? Does my small sample size matter? My results so far: In R, I just put all the variables into one data.frame and filled the missing values due to unequal sample size by NAs (that's why there is the nums column in my data.frame below). Then, I ran a MANOVA like this: pH + water content + thickness + roughness ~ tree species with the manova function. Example Data: manova_df = structure(list(abbr = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("AS", "BU", "CL", "MB", "PR" ), class = "factor"), nums = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L), comb_rugosity = c(3.44, 2.29, 5.21, 1.45, 2.84, 4.25, 1.54, 2.97, 1.38, 2.45, 9.44, 0, 0.58, 7.71, 5.53, 0.84, 1.22, 1, 10.83, 15.77, 5.5, 8.49, 10.46, 9.16, 5.52, 6.55, 1.77, 10.68, 13.43, 20.8, 8.82, 18.09, 15.1, 15.41, 16.3, 13.2, 2.67, 0.95, 1.49, 2.7, 0, 0.92, 0.83, 0, 1.89), bark_mm = c(9.59, 4.17, 17.23, 8.49, 3.58, NA, NA, NA, NA, 8.06, 13.53, 6.33, 10.96, 12.14, NA, NA, NA, NA, 17.94, 7.33, 10.54, 14.68, 16.66, NA, NA, NA, NA, 8.52, 8.72, 7.57, 11.89, 6.41, NA, NA, NA, NA, 2.59, 9, 3.26, 5.81, NA, NA, NA, NA, NA), pH = c(6.5, 7.33, 8.17, NA, NA, NA, NA, NA, NA, 7.84, 3.71, 12.47, 4.39, NA, NA, NA, NA, NA, 11.04, 6.22, 5.41, 4.29, NA, NA, NA, NA, NA, 9.26, 11.18, 6.3, NA, NA, NA, NA, NA, NA, 8.42, 7.75, 4.33, NA, NA, NA, NA, NA, NA), whc = c(192, 251, 166, 170, 466, NA, NA, NA, NA, 308, 187, 595, 324, 364, NA, NA, NA, NA, 171, 406, 790, 292, 579, NA, NA, NA, NA, 672, 251, 700, 245, 260, 485, 383, NA, NA, 325, 481, 338, 476, 968, NA, NA, NA, NA), ret = c(83, 90, 286, 309, 374, NA, NA, NA, NA, 109, 159, 98, 164, 636, NA, NA, NA, NA, 144, 234, 383, 178, 446, NA, NA, NA, NA, 275, 56, 178, 107, 125, 367, 137, NA, NA, 132, 120, 142, 147, 330, NA, NA, NA, NA)), row.names = c(NA, -45L), class = c("tbl_df", "tbl", "data.frame")) Which looks like this (where abbr is the tree species, nums is the number of the measurement per tree 
species and the rest are the tree parameters):
> manova_df
# A tibble: 45 x 7
   abbr   nums comb_rugosity bark_mm    pH   whc   ret
   <fct> <int>         <dbl>   <dbl> <dbl> <dbl> <dbl>
 1 AS        1          3.44    9.59  6.5    192    83
 2 AS        2          2.29    4.17  7.33   251    90
 3 AS        3          5.21   17.2   8.17   166   286
 4 AS        4          1.45    8.49 NA      170   309
 5 AS        5          2.84    3.58 NA      466   374
 6 AS        6          4.25   NA    NA       NA    NA
 7 AS        7          1.54   NA    NA       NA    NA
 8 AS        8          2.97   NA    NA       NA    NA
 9 AS        9          1.38   NA    NA       NA    NA
10 BU        1          2.45    8.06  7.84   308   109
# ... with 35 more rows
My analysis is pretty straightforward:
mano_mod = manova(cbind(pH, bark_mm, comb_rugosity, whc, ret) ~ abbr, data = manova_df)
> summary(mano_mod)
          Df Pillai approx F num Df den Df Pr(>F)
abbr       4 1.5708   1.4226     20     44 0.1628
Residuals 12
I did not include my real data here, but they follow the same structure. The given data are far from being significant, whereas my actual data are! My question is solely about the many NAs in my data and whether the test is accurate. (If anything is unclear, please ask.)
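One detail worth checking (a sketch, with no claim about the statistical validity of the design): manova(), like lm(), silently drops any row that has an NA in any response, so the model above is fit on the complete cases only, which is where the small residual Df comes from.
# rows that survive NA handling: only these enter the MANOVA
sum(complete.cases(manova_df[, c("comb_rugosity", "bark_mm", "pH", "whc", "ret")]))
# [1] 17   -- 17 complete rows minus 5 species gives the Residuals Df of 12 shown above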
How to get conditional weighted means for several columns
For the following dataframe: eu <- structure(list(land = structure(c(1L, 4L, 5L, 12L, 9L, 13L, 16L, 18L, 27L, 10L, 25L, 21L, 28L, 19L, 8L, 26L, 6L, 3L, 15L, 14L, 11L, 17L, 20L, 23L, 24L, 2L, 22L, 7L), .Label = c("Belgie", "Bulgarije", "Cyprus", "Denemarken", "Duitsland", "Estland", "Europese Unie", "Finland", "Frankrijk", "Griekenland", "Hongarije", "Ierland", "Italie", "Letland", "Litouwen", "Luxemburg", "Malta", "Nederland", "Oostenrijk", "Polen", "Portugal", "Roemenie", "Slovenie", "Slowakije", "Spanje", "Tsjechie", "Verenigd Koninkrijk", "Zweden"), class = "factor"), `1979` = c(91.36, 47.82, 65.73, 63.61, 60.71, 85.65, 88.91, 58.12, 32.35, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 61.99), `1981` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 81.48, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), `1984` = c(92.09, 52.38, 56.76, 47.56, 56.72, 82.47, 88.79, 50.88, 32.57, 80.59, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 58.98), `1987` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 68.52, 72.42, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), `1989` = c(90.73, 46.17, 62.28, 68.28, 48.8, 81.07, 87.39, 47.48, 36.37, 80.03, 54.71, 51.1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 58.41), `1994` = c(90.66, 52.92, 60.02, 43.98, 52.71, 73.6, 88.55, 35.69, 36.43, 73.18, 59.14, 35.54, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 56.67), `1995` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 41.63, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), `1996` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 67.73, 57.6, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), `1999` = c(91.05, 50.46, 45.19, 50.21, 46.76, 69.76, 87.27, 30.02, 24, 70.25, 63.05, 39.93, 38.84, 49.4, 30.14, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 49.51), `2004` = c(90.81, 47.89, 43, 58.58, 42.76, 71.72, 91.35, 39.26, 38.52, 63.22, 45.14, 38.6, 37.85, 42.43, 39.43, 28.3, 26.83, 72.5, 48.38, 41.34, 38.5, 82.39, 20.87, 28.35, 16.97, NA, NA, 45.47), `2007` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 29.22, 29.47, NA), `2009` = c(90.39, 59.54, 43.3, 58.64, 40.63, 65.05, 90.75, 36.75, 34.7, 52.61, 44.9, 36.78, 45.53, 45.97, 40.3, 28.2, 43.9, 59.4, 20.98, 53.7, 36.31, 78.79, 24.53, 28.33, 19.64, 38.99, 27.67, 43), inwoners = c(11161642, 5602628, 80523746, 4591087, 65578819, 59685227, 537039, 16779575, 63896071, 11062508, 46727890, 10487289, 9555893, 8451860, 5426674, 10516125, 1320174, 865878, 2971905, 2023825, 9908798, 421364, 38533299, 2058821, 5410836, 7284552, 20020074, 501403599), plicht = structure(c(1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("ja", "nee"), class = "factor")), .Names = c("land", "1979", "1981", "1984", "1987", "1989", "1994", "1995", "1996", "1999", "2004", "2007", "2009", "inwoners", "plicht"), row.names = c(NA, -28L), class = "data.frame") I need conditional column means. I can do that with: verplicht <- c("Europese Unie (stemplicht)", colMeans(eu[eu$plicht=="ja",c(2:13)], na.rm=TRUE), NA) vrij <- c("Europese Unie (geen stemplicht)", colMeans(eu[eu$plicht=="nee",c(2:13)], na.rm=TRUE), NA) eu2 <- rbind(eu, verplicht, vrij) However, I need weighted column means with country population (the inwoners column) as the weights. 
I tried to do that with:
verplicht <- c("Europese Unie (stemplicht)",
               lapply(eu[eu$plicht=="ja",c(2:13)],
                      weighted.mean(x, eu[eu$plicht=="ja",14], na.rm=TRUE)),
               NA)
but that resulted in the following error:
Error in weighted.mean.default(x, eu[eu$plicht == "ja", 14], na.rm = TRUE) :
  'x' and 'w' must have the same length
I understand what the error message is saying, but I don't know how to solve this. Any suggestions?
The problem is with how you're using lapply. Here's the correct code:
lapply(eu[eu$plicht=='ja',2:13], weighted.mean, eu[eu$plicht=='ja','inwoners'], na.rm=TRUE)
lapply(eu[eu$plicht=='nee',2:13], weighted.mean, eu[eu$plicht=='nee','inwoners'], na.rm=TRUE)
Notice how weighted.mean is passed as an argument, rather than called inside an anonymous function with x as an argument. You could equivalently do:
lapply(eu[eu$plicht=='ja',2:13], function(x) weighted.mean(x, eu[eu$plicht=='ja','inwoners'], na.rm=TRUE))
lapply(eu[eu$plicht=='nee',2:13], function(x) weighted.mean(x, eu[eu$plicht=='nee','inwoners'], na.rm=TRUE))
But you're currently mixing the two different ways of using lapply.
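A possible follow-up (a sketch, not part of the answer above; the wm helper and the column names jaar/stemplicht/vrij are my own): sapply() returns a plain numeric vector, which makes it easy to collect both sets of weighted means into one small data.frame, one row per election year.
wm <- function(rows) sapply(eu[rows, 2:13], weighted.mean,
                            w = eu[rows, "inwoners"], na.rm = TRUE)

data.frame(jaar       = names(eu)[2:13],
           stemplicht = wm(eu$plicht == "ja"),
           vrij       = wm(eu$plicht == "nee"),
           row.names  = NULL)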
If inwoners is the population, then
> (weights <- with(eu, inwoners/sum(inwoners)))
#  [1] 0.0111303968 0.0055869443 0.0802983327 0.0045782350 0.0653952416
#  [6] 0.0595181478 0.0005355356 0.0167326033 0.0637172042 0.0110315403
# [11] 0.0465970828 0.0104579315 0.0095291428 0.0084282004 0.0054114829
# [16] 0.0104866868 0.0013164784 0.0008634541 0.0029635856 0.0020181596
# [21] 0.0098810599 0.0004201845 0.0384254312 0.0020530577 0.0053956892
# [26] 0.0072641601 0.0199640310 0.5000000000
and the weighted mean of the 2004 column, for example, is
> weighted.mean(eu$`2004`, w = weights, na.rm = TRUE)
# [1] 45.31782
To get the weighted mean of each of the year columns for when plicht == 'ja',
> s <- subset(eu, plicht == "ja")
> w2 <- weights[as.numeric(rownames(s))]
> newDF <- do.call(rbind, lapply(2:13, function(i){
    data.frame(wtMean.ja = weighted.mean(s[,i], w = w2, na.rm = TRUE))
  }))
> rownames(newDF) <- names(s)[2:13]
> newDF
#      wtMean.ja
# 1979  86.56735
# 1981  81.48000
# 1984  83.56127
# 1987  68.52000
# 1989  72.30636
# 1994  69.86950
# 1995       NaN
# 1996       NaN
# 1999  69.28708
# 2004  63.17060
# 2007       NaN
# 2009  58.99465