I am generating a heatmap of the distribution of biological functional classes by tissue type for an analysis I'm working on. I've successfully generated the heatmap using geom_tile, but would like to keep the grid within the white space that appears in the heatmap.
This white space appears because there are no data for those comparisons (not NAs or zeros; the rows are completely absent). Is it possible to either 1) edit the graphics to draw the grid over the white space, or 2) edit the data frame to include NAs or zeros where those data are currently absent?
Here are my data:
structure(list(Tissue = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("FB",
"SOG", "MG", "HG", "MT"), class = "factor"), Transcript_Count = c(64,
36, 35, 42, 66, 122, 62, 40, 34, 46, 40, 36, 41, 37, 36, 37,
40, 35, 38, 40, 53, 37, 36, 36, 68, 40, 40, 116, 84, 149, 45,
72, 42, 65, 78, 37, 62, 35, 35, 43, 38, 152, 37, 60, 36, 66,
40, 60, 45, 35, 36, 35, 129, 193, 153, 420, 247, 357, 237, 343,
199, 484, 112, 464, 244, 150, 127, 151, 247, 152, 238, 246, 127,
127, 120, 182, 245, 128, 388, 279, 246, 139, 120, 120, 120, 146,
119, 143, 144, 133, 126, 133, 143, 143, 218, 131, 121, 120, 119,
124, 127, 119, 124, 124, 119, 224, 306, 387, 102, 108, 122, 136,
186, 373, 85, 151, 156, 83, 161, 127, 286, 135, 82, 180, 150,
158, 157, 76, 142, 95, 79, 81, 78, 79, 77, 183, 88, 99, 189,
356, 162, 150, 125, 110, 96, 98, 88, 91, 100, 93, 101, 150, 90,
88, 193, 96, 100, 336, 275, 410, 108, 225, 103, 187, 237, 90,
163, 131, 100, 92, 427, 90, 171, 88, 190, 102, 175, 109, 107,
80, 97, 87, 72, 256, 185, 144, 266, 233, 150, 83, 106, 133, 133,
133, 69, 217, 70, 134, 131, 101, 121, 58, 67, 65, 61, 58, 64,
64, 64, 65, 58, 57), GO.ID = structure(c(1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L,
32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L,
45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 19L, 9L, 10L, 6L, 37L,
35L, 8L, 29L, 39L, 42L, 53L, 30L, 34L, 31L, 22L, 49L, 25L, 21L,
1L, 46L, 43L, 36L, 12L, 48L, 5L, 41L, 28L, 32L, 7L, 40L, 23L,
15L, 18L, 33L, 38L, 20L, 47L, 26L, 54L, 11L, 27L, 17L, 44L, 13L,
14L, 51L, 3L, 24L, 16L, 52L, 2L, 45L, 50L, 29L, 6L, 42L, 9L,
39L, 8L, 37L, 35L, 30L, 10L, 1L, 34L, 49L, 25L, 21L, 28L, 7L,
31L, 32L, 48L, 46L, 5L, 27L, 44L, 4L, 47L, 40L, 17L, 33L, 20L,
1L, 2L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 16L, 17L,
19L, 20L, 21L, 22L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L,
33L, 34L, 35L, 36L, 37L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L,
47L, 48L, 49L, 37L, 9L, 8L, 39L, 10L, 30L, 29L, 35L, 42L, 6L,
32L, 21L, 7L, 5L, 25L, 34L, 31L, 28L, 49L, 46L, 1L, 48L, 44L,
11L, 40L, 47L, 55L, 26L, 27L, 17L, 20L, 33L, 13L, 16L), .Label = c("GO:0006139",
"GO:0006351", "GO:0006355", "GO:0006508", "GO:0006725", "GO:0006807",
"GO:0006810", "GO:0007154", "GO:0007165", "GO:0009058", "GO:0009059",
"GO:0009889", "GO:0010467", "GO:0010468", "GO:0010556", "GO:0016070",
"GO:0018130", "GO:0019219", "GO:0019222", "GO:0019438", "GO:0019538",
"GO:0031323", "GO:0031326", "GO:0032774", "GO:0034641", "GO:0034645",
"GO:0034654", "GO:0043170", "GO:0044237", "GO:0044238", "GO:0044249",
"GO:0044260", "GO:0044271", "GO:0046483", "GO:0050794", "GO:0051171",
"GO:0051234", "GO:0051252", "GO:0051716", "GO:0055085", "GO:0060255",
"GO:0071704", "GO:0080090", "GO:0090304", "GO:0097659", "GO:1901360",
"GO:1901362", "GO:1901564", "GO:1901576", "GO:1903506", "GO:2000112",
"GO:2001141", "GO:0003008", "GO:0006811", "GO:0006259"), class = "factor")), row.names = c(NA,
-212L), class = "data.frame")
And my code to generate the heatmap:
(ggplot(All_Tissues_BP_Head, aes(Tissue, GO.ID)) +
Alex_Theme +
geom_tile(aes(fill = Transcript_Count), color = "black") +
scale_fill_gradient2(low = "white", mid = "blue", high = "black",
midpoint = mean(All_Tissues_BP_Head$Transcript_Count)) +
scale_x_discrete(expand = c(0,0)) +
ggtitle(expression(atop(bold("Biological Processes")))) +
theme(legend.title = element_text(size=12),
legend.text = element_text(size=12)) +
theme(axis.text = element_text(size=12),
axis.title.y = element_blank(),
axis.title.x = element_text(size = 12)) +
labs(fill = "Transcript \n count"))
Use the complete function from tidyr to fill in the missing factor combinations in your data frame with NA.
Then you can use the na.value parameter in the color gradient to set the color used for those NA tiles.
library(ggplot2)
library(dplyr)
library(tidyr)
# z <- complete(All_Tissues_BP_Head, Tissue, GO.ID)
ggplot(complete(All_Tissues_BP_Head, Tissue, GO.ID), aes(Tissue, GO.ID)) +
geom_tile(aes(fill = Transcript_Count), color = "black") +
scale_fill_gradient2(low = "white", mid = "blue", high= "black",
midpoint = mean(All_Tissues_BP_Head$Transcript_Count), na.value="black") +
scale_x_discrete(expand = c(0,0)) +
ggtitle(expression(atop(bold("Biological Processes")))) +
theme(legend.title = element_text(size=12),
legend.text = element_text(size=12)) +
theme(axis.text = element_text(size=12),
axis.title.y = element_blank(),
axis.title.x = element_text(size = 12)) +
labs(fill = "Transcript \n count")
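To see what complete() does on its own, here is a minimal sketch on a toy data frame (hypothetical values, not the real data):

library(tidyr)
toy <- data.frame(Tissue = factor(c("FB", "SOG")),
                  GO.ID = factor(c("GO:0006139", "GO:0006351")),
                  Transcript_Count = c(64, 129))
complete(toy, Tissue, GO.ID)
# Returns all 4 Tissue/GO.ID combinations; the two combinations that are
# absent from toy get Transcript_Count = NA, which geom_tile then colors
# via na.value.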
I am comparing the frequency of acute surgical procedures per week before and during the Covid-19 pandemic. I have a simple linear regression in which each observation corresponds to a week, represented in lin.model$cons_week. There are 221 observations, corresponding to 221 successive weeks from 2017-01-02 (the first Monday of 2017) up to 2021-04-05.
mod <- fortify(lm(n ~ cons_week * corona, data = lin.model))
With n being the number of surgical procedures and corona indicating which time period cons_week belongs to.
I have
mod %>%
ggplot(aes(y = .cooksd)) +
geom_col(data = filter(mod, corona == "Normal") %>% droplevels(),
aes(seq_along(.cooksd)),
color = "#6DBCC3", fill = alpha("#6DBCC3", .2)) +
geom_col(data = filter(mod, corona == "C19") %>% droplevels(),
aes(seq_along(.cooksd)+167),
color = "#8B3A62", fill = alpha("#8B3A62", .2)) +
geom_hline(yintercept = 4/nrow(lin.model), col = "black", lty = 2) +
geom_vline(xintercept = 167, color = "red", lty = 2) +
scale_y_continuous(name = "Cook's Distance")
Giving this plot:
However, I would like the x-axis to show some sort of sensible timeline, as each Cook's distance observation corresponds to one week.
I tried scale_x_date():
mod %>%
mutate(cons_week_dt = as.Date("2017-01-02") + cons_week*7) %>%
ggplot(aes(x = cons_week_dt, y = .cooksd)) +
geom_col(data = filter(mod, corona == "Normal") %>% droplevels(),
aes(seq_along(.cooksd)),
color = "#6DBCC3", fill = alpha("#6DBCC3", .2)) +
geom_col(data = filter(mod, corona == "C19") %>% droplevels(),
aes(seq_along(.cooksd)+167),
color = "#8B3A62", fill = alpha("#8B3A62", .2)) +
geom_hline(yintercept = 4/nrow(lin.model), col = "black", lty = 2) +
geom_vline(xintercept = 167, color = "red", lty = 2) +
scale_y_continuous(name = "Cook's Distance") +
scale_x_date(name = "",
date_breaks = "3 months", date_labels = "%B-%Y", expand = c(0.01, 0))
But that returns an error:
Error: Invalid input: date_trans works with objects of class Date only
lin.model <- structure(list(corona = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("Normal", "C19"), class = "factor"), cons_week = c(56,
6, 150, 87, 16, 100, 16, 149, 62, 38, 74, 3, 80, 64, 72, 80,
71, 25, 100, 159, 80, 72, 35, 14, 55, 162, 14, 4, 100, 34, 81,
59, 156, 84, 165, 129, 70, 140, 113, 13, 65, 76, 33, 7, 93, 40,
107, 72, 43, 123, 142, 65, 83, 119, 63, 116, 18, 48, 97, 6, 93,
79, 81, 158, 8, 45, 143, 114, 83, 96, 26, 91, 113, 22, 54, 126,
93, 39, 33, 132, 97, 126, 136, 145, 35, 30, 109, 160, 20, 74,
162, 26, 112, 145, 28, 22, 28, 153, 131, 136, 209, 203, 188,
180, 214, 180, 176, 203, 219, 172, 222, 212, 183, 200, 218, 208,
198, 169, 210, 222, 207, 184, 175, 185, 198, 192, 206, 195, 219,
193, 197, 217, 204, 193, 193, 182, 175, 169, 170, 208, 192, 181,
186, 186, 209, 212, 200, 194, 213, 178, 213, 212, 209, 205, 183,
206, 199, 181, 187, 174, 177, 215, 193, 207, 207, 204, 187, 195,
174, 184, 171, 218, 188, 181, 197, 180, 208, 203, 192, 173, 218,
173, 196, 185, 212, 201, 194, 221, 205, 210, 213, 174, 207, 181,
189, 179, 200, 196, 216, 201), n = c(9L, 14L, 11L, 15L, 19L,
12L, 19L, 20L, 12L, 17L, 9L, 13L, 7L, 6L, 12L, 7L, 11L, 15L,
12L, 8L, 7L, 12L, 15L, 13L, 10L, 11L, 13L, 20L, 12L, 10L, 11L,
11L, 16L, 17L, 13L, 12L, 15L, 6L, 13L, 14L, 14L, 16L, 25L, 15L,
11L, 19L, 22L, 12L, 18L, 18L, 12L, 14L, 11L, 18L, 14L, 11L, 14L,
14L, 15L, 14L, 11L, 15L, 11L, 15L, 16L, 14L, 11L, 12L, 11L, 18L,
19L, 16L, 13L, 10L, 14L, 19L, 11L, 12L, 25L, 9L, 15L, 19L, 15L,
19L, 15L, 17L, 11L, 11L, 17L, 9L, 11L, 19L, 16L, 19L, 17L, 10L,
17L, 14L, 12L, 15L, 15L, 12L, 14L, 10L, 13L, 10L, 9L, 12L, 18L,
15L, 20L, 17L, 13L, 10L, 14L, 13L, 17L, 15L, 14L, 20L, 16L, 10L,
11L, 9L, 17L, 15L, 15L, 9L, 18L, 12L, 14L, 10L, 16L, 12L, 12L,
16L, 11L, 15L, 8L, 13L, 15L, 13L, 19L, 19L, 15L, 17L, 10L, 8L,
10L, 12L, 10L, 17L, 15L, 19L, 13L, 15L, 17L, 13L, 15L, 13L, 11L,
16L, 12L, 16L, 16L, 16L, 15L, 9L, 13L, 10L, 11L, 14L, 14L, 13L,
14L, 10L, 13L, 12L, 15L, 22L, 14L, 22L, 22L, 9L, 17L, 15L, 8L,
9L, 19L, 14L, 10L, 13L, 16L, 13L, 12L, 15L, 10L, 22L, 14L, 15L
)), row.names = c(NA, -200L), groups = structure(list(corona = structure(1:2, .Label = c("Normal",
"C19"), class = "factor"), .rows = structure(list(1:100, 101:200), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
The main issue with your code is that you mapped seq_along(.cooksd) on x inside the geom_col layers, which overrides the date you mapped in ggplot() (hence the date_trans error). To solve this, prepare your data as a separate step so that all layers make use of the same data. Also, there is no need for the subsetting and the two geom_cols: one is enough if you map your variable corona on color and fill and set the colors via scale_color_manual/scale_fill_manual.
library(ggplot2)
library(dplyr)
mod <- fortify(lm(n ~ cons_week * corona, data = lin.model))
mod1 <- mod %>%
mutate(cons_week_dt = as.Date("2017-01-02") + cons_week*7)
cols <- c(Normal = "#6DBCC3", C19 = "#8B3A62")
ggplot(mod1, aes(x = cons_week_dt, y = .cooksd)) +
geom_col(aes(color = corona, fill = corona)) +
geom_hline(yintercept = 4/nrow(lin.model), col = "black", lty = 2) +
geom_vline(xintercept = as.Date("2017-01-02") + 167*7, # must be a Date now that x is a date scale
color = "red", lty = 2) +
scale_color_manual(values = cols) +
scale_fill_manual(values = alpha(cols, .2)) +
scale_y_continuous(name = "Cook's Distance") +
scale_x_date(name = "",
date_breaks = "3 months", date_labels = "%B-%Y", expand = c(0.01, 0)) +
guides(color = "none", fill = "none")
I have 300 CSV files with the same structure in a folder, one per village. I need to read each file individually, process it, and export an output file into another folder with the respective village name (e.g., 'village name'_score).
Here are the data for an example village file...
structure(list(ID_GC = structure(1:51, .Label = c("492K", "494K",
"497K", "498K", "499K", "500K", "501K", "502K", "503K", "504K",
"506K", "507K", "508K", "509K", "510K", "511K", "512K", "513K",
"514K", "516K", "517K", "518K", "519K", "522K", "523K", "524K",
"526K", "527K", "528K", "530K", "531K", "532K", "533K", "534K",
"535K", "536K", "537K", "538K", "539K", "540K", "541K", "542K",
"543K", "544K", "545K", "546K", "547K", "548K", "550K", "551K",
"552K"), class = "factor"), Lat = c(23.78107, 23.78115, 23.78122,
23.78123, 23.78125, 23.78081, 23.78096, 23.78062, 23.78068, 23.78071,
23.78075, 23.78043, 23.78021, 23.77937, 23.77985, 23.77981, 23.77995,
23.77987, 23.7799, 23.7796, 23.77944, 23.77934, 23.77937, 23.77906,
23.77899, 23.77907, 23.77889, 23.77898, 23.77863, 23.77865, 23.77855,
23.77852, 23.77843, 23.77806, 23.77824, 23.77809, 23.7781, 23.77797,
23.77788, 23.77786, 23.77809, 23.77815, 23.77771, 23.77757, 23.77772,
23.77752, 23.7774, 23.7772, 23.77869, 23.78084, 23.78178), Long = c(90.65016,
90.64968, 90.6497, 90.64969, 90.64972, 90.64996, 90.64987, 90.64989,
90.64924, 90.64921, 90.65, 90.64998, 90.6494, 90.64989, 90.64978,
90.64973, 90.64952, 90.64958, 90.64925, 90.64935, 90.6492, 90.64922,
90.64919, 90.64928, 90.64937, 90.64887, 90.64919, 90.64891, 90.64914,
90.64903, 90.64907, 90.6491, 90.64868, 90.6491, 90.64853, 90.64862,
90.64851, 90.64852, 90.64865, 90.64865, 90.64878, 90.64878, 90.64866,
90.64859, 90.64844, 90.64839, 90.64858, 90.64861, 90.64922, 90.64994,
90.64925), Village = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Abdullapur", class = "factor"),
Depth_m = c(18, 18, 18, 210, 18, 31.5, 13.5, 15, 13.5, 21,
13.5, 18, 15, 240, 24, 13.5, 19.5, 33, 156, 14.4, 18, 21,
13.5, 18, 18, 51, 48, 54, 67.5, 69, 69, 66, 66, 21, 60, 66,
54, 31.5, 21, 210, 66, 12, 54, 27, 219, 18, 18, 18, 18, 18,
21), As_ug_L = c(68L, 68L, 68L, 2L, 68L, 306L, 129L, 129L,
20L, 68L, 188L, 129L, 68L, 2L, 68L, 68L, 129L, 188L, 2L,
2L, 68L, 37L, 20L, 306L, 306L, 20L, 306L, 20L, 2L, 2L, 2L,
2L, 2L, 306L, 2L, 2L, 2L, 306L, 306L, 2L, 2L, 306L, 2L, 306L,
20L, 306L, 68L, 68L, 306L, 68L, 20L)), class = "data.frame", row.names = c(NA,
-51L))
And another dataset ("dtw_BG") that is needed for the calculations for all villages...
structure(list(ID_GC = structure(c(10L, 11L, 12L, 13L, 14L, 8L,
9L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 1L, 2L, 3L, 4L, 5L, 6L,
7L), .Label = c("1002F", "1008F", "1016F", "1029F", "1051F",
"1053F", "1058F", "1548D", "1561D", "498K", "509K", "514K", "540K",
"545K", "559K", "560K", "569K", "571K", "597K", "601K", "614K",
"819F", "829F", "933F", "934F", "951F", "957F", "958F", "959F",
"960F", "964F", "973F", "982F", "998F"), class = "factor"), Lat = c(23.78123,
23.77937, 23.7799, 23.77786, 23.77772, 23.77439336, 23.77204886,
23.77484, 23.775, 23.77528, 23.77492, 23.77521, 23.77593, 23.7757,
23.78494, 23.78473, 23.78385611, 23.78395451, 23.78426992, 23.78374538,
23.78377154, 23.78360725, 23.78340944, 23.78362259, 23.78272036,
23.78307399, 23.78269739, 23.78252464, 23.78279102, 23.78131262,
23.78149057, 23.77867098, 23.77828323, 23.78592929), Long = c(90.64969,
90.64989, 90.64925, 90.64865, 90.64844, 90.65543457, 90.65292302,
90.65158, 90.65192, 90.65219, 90.65232, 90.65363, 90.65356, 90.65483,
90.65025, 90.65238, 90.64900976, 90.64933908, 90.65082989, 90.64891814,
90.64902199, 90.64910447, 90.64933699, 90.6488857, 90.64921562,
90.64848103, 90.64799873, 90.64826494, 90.64738669, 90.64781684,
90.64612672, 90.64499055, 90.64476985, 90.6499865), Village = structure(c(1L,
1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L), .Label = c("Abdullapur", "Chauthar Kanda", "Nagra Para Faitadi",
"Nowa Para"), class = "factor"), Depth_m = c(210, 240, 156, 210,
219, 225, 195, 299.7, 299.7, 240, 240, 234, 240, 105, 165, 180,
180, 225, 180, 210, 195, 201, 180, 195, 210, 210, 195, 180, 225,
180, 108, 210, 225, 240), As_ug_L = c(2L, 2L, 2L, 2L, 20L, 2L,
2L, 2L, 20L, 2L, 2L, 7L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-34L))
I need to process all the villages, but I am not certain how to loop over them. So far I have been able to collect the paths to all the individual village files:
library(readr)
a <- list.files(path = "/Users/......",
pattern = "*.csv", full.names = T)
Here is the code I'm using for an individual village:
library(geosphere) # provides distGeo()

dtw_BG <- read.csv('/Users/...../dtw_BG.csv', header = TRUE)
gw <- read.csv('/Users/....../Abdullapur.csv', header = TRUE)
stw = gw[gw$Depth_m <= 90,]
stw_R = gw[gw$Depth_m <= 90 & gw$As_ug_L > 50,]
itw = gw[gw$Depth_m >= 45 & gw$Depth_m <= 90,]
itw_10 = gw[gw$Depth_m >= 45 & gw$Depth_m <= 90 & gw$As_ug_L <= 10,]
p<-stw [,c(3,2)]
R<-stw_R[,c(3,2)]
ITW<-itw[,c(3,2)]
ITW_10<- itw_10[,c(3,2)]
BG<-dtw_BG[,c(3,2)]
dist_R<- lapply(1:length(p[[1]]), function (i) distGeo (R, p[i,]))
dist_R<-lapply(1:length(p[[1]]), function (i) data.frame(R, dist_R[[i]]))
dist_R100<-lapply(1:length(p[[1]]),function (i) dist_R[[i]][dist_R[[i]][,3] <= 100,])
maxscore<- lapply(1:length(p[[1]]), function(i) nrow (dist_R100[[i]]))
maxscore<-unlist(maxscore)
dist_ITW<- lapply(1:length(p[[1]]), function (i) distGeo (ITW, p[i,]))
dist_ITW<-lapply(1:length(p[[1]]), function (i) data.frame(ITW, dist_ITW[[i]]))
dist_ITW100<-lapply(1:length(p[[1]]),function (i) dist_ITW[[i]][dist_ITW[[i]][,3] <= 100,])
count_itw<- lapply(1:length(p[[1]]), function(i) nrow (dist_ITW100[[i]]))
count_itw<-unlist(count_itw)
if (nrow(ITW_10)==0) {
count_itw10<- rep(0, length(maxscore))
} else {
dist_ITW10<- lapply(1:length(p[[1]]), function (i) distGeo (ITW_10, p[i,]))
dist_ITW10<-lapply(1:length(p[[1]]), function (i) data.frame(ITW_10, dist_ITW10[[i]]))
dist_ITW10_100<-lapply(1:length(p[[1]]),function (i) dist_ITW10[[i]][dist_ITW10[[i]][,3] <= 100,])
count_itw10<- lapply(1:length(p[[1]]), function(i) nrow (dist_ITW10_100[[i]]))
count_itw10<-unlist(count_itw10)
}
dist_BG<- lapply(1:length(p[[1]]), function (i) distGeo (BG, p[i,]))
dist_BG<-lapply(1:length(p[[1]]), function (i) data.frame(BG, dist_BG[[i]]))
dtw<-lapply(1:length(p[[1]]), function(i) {
lapply(1: length(maxscore), function(j) {
min(distGeo( c(dist_R100[[i]][j,1], dist_R100[[i]][j,2]), dist_BG[[i]]))
}
)
}
)
dtw<-unlist(dtw)
dtw<-split(dtw, (0:length(dtw) %/% length(p[[1]])))
dtw <- dtw[-length (dtw)]
count<-lapply(1:length(dtw), function(i) length(subset(dtw[[i]], dtw[[i]]<=100)))
count<-unlist(count)
score<-maxscore-count
abc<-cbind (stw, maxscore, count, score, count_itw, count_itw10)
abc<- data.frame (abc)
write.csv (abc, "/Users/..../Output/Abdullapur_score.csv", row.names = F)
The output for the provided village should look like this:
structure(list(ID_GC = structure(c(1L, 2L, 3L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 20L, 21L, 22L, 23L,
24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L,
37L, 38L, 39L, 41L, 42L, 43L, 44L, 46L, 47L, 48L, 49L, 50L, 51L
), .Label = c("492K", "494K", "497K", "498K", "499K", "500K",
"501K", "502K", "503K", "504K", "506K", "507K", "508K", "509K",
"510K", "511K", "512K", "513K", "514K", "516K", "517K", "518K",
"519K", "522K", "523K", "524K", "526K", "527K", "528K", "530K",
"531K", "532K", "533K", "534K", "535K", "536K", "537K", "538K",
"539K", "540K", "541K", "542K", "543K", "544K", "545K", "546K",
"547K", "548K", "550K", "551K", "552K"), class = "factor"), Lat = c(23.78107,
23.78115, 23.78122, 23.78125, 23.78081, 23.78096, 23.78062, 23.78068,
23.78071, 23.78075, 23.78043, 23.78021, 23.77985, 23.77981, 23.77995,
23.77987, 23.7796, 23.77944, 23.77934, 23.77937, 23.77906, 23.77899,
23.77907, 23.77889, 23.77898, 23.77863, 23.77865, 23.77855, 23.77852,
23.77843, 23.77806, 23.77824, 23.77809, 23.7781, 23.77797, 23.77788,
23.77809, 23.77815, 23.77771, 23.77757, 23.77752, 23.7774, 23.7772,
23.77869, 23.78084, 23.78178), Long = c(90.65016, 90.64968, 90.6497,
90.64972, 90.64996, 90.64987, 90.64989, 90.64924, 90.64921, 90.65,
90.64998, 90.6494, 90.64978, 90.64973, 90.64952, 90.64958, 90.64935,
90.6492, 90.64922, 90.64919, 90.64928, 90.64937, 90.64887, 90.64919,
90.64891, 90.64914, 90.64903, 90.64907, 90.6491, 90.64868, 90.6491,
90.64853, 90.64862, 90.64851, 90.64852, 90.64865, 90.64878, 90.64878,
90.64866, 90.64859, 90.64839, 90.64858, 90.64861, 90.64922, 90.64994,
90.64925), Village = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Abdullapur", class = "factor"),
Depth_m = c(18, 18, 18, 18, 31.5, 13.5, 15, 13.5, 21, 13.5,
18, 15, 24, 13.5, 19.5, 33, 14.4, 18, 21, 13.5, 18, 18, 51,
48, 54, 67.5, 69, 69, 66, 66, 21, 60, 66, 54, 31.5, 21, 66,
12, 54, 27, 18, 18, 18, 18, 18, 21), As_ug_L = c(68L, 68L,
68L, 68L, 306L, 129L, 129L, 20L, 68L, 188L, 129L, 68L, 68L,
68L, 129L, 188L, 2L, 68L, 37L, 20L, 306L, 306L, 20L, 306L,
20L, 2L, 2L, 2L, 2L, 2L, 306L, 2L, 2L, 2L, 306L, 306L, 2L,
306L, 2L, 306L, 306L, 68L, 68L, 306L, 68L, 20L), maxscore = c(10L,
11L, 11L, 11L, 12L, 12L, 16L, 13L, 12L, 12L, 16L, 13L, 8L,
10L, 9L, 10L, 9L, 10L, 10L, 10L, 7L, 7L, 5L, 7L, 6L, 9L,
9L, 9L, 8L, 9L, 9L, 9L, 9L, 8L, 8L, 8L, 9L, 9L, 8L, 8L, 8L,
8L, 6L, 7L, 12L, 3L), count = c(10L, 11L, 11L, 11L, 12L,
12L, 16L, 13L, 12L, 12L, 16L, 13L, 8L, 10L, 9L, 10L, 9L,
9L, 9L, 9L, 6L, 6L, 4L, 6L, 5L, 8L, 8L, 8L, 7L, 8L, 8L, 8L,
8L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 6L, 6L, 12L, 3L),
score = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 0L), count_itw = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 3L, 6L, 7L, 7L, 8L, 8L,
9L, 10L, 10L, 12L, 12L, 12L, 12L, 13L, 11L, 13L, 10L, 10L,
10L, 10L, 12L, 12L, 6L, 6L, 5L, 5L, 2L, 12L, 0L, 0L), count_itw10 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 3L, 4L, 4L, 5L, 5L, 6L, 7L, 7L, 9L, 9L, 9L, 9L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 6L, 6L, 5L, 5L, 2L,
9L, 0L, 0L)), class = "data.frame", row.names = c(1L, 2L,
3L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L,
20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L,
33L, 34L, 35L, 36L, 37L, 38L, 39L, 41L, 42L, 43L, 44L, 46L, 47L,
48L, 49L, 50L, 51L))
How can I export each village file with respect to its name?
Thank you in advance :)
Simply generalize your process in a defined function that receives the village file name as a parameter. Then build a list of data frames by iterating through the file names and calling your function:
library(geosphere) # provides distGeo(), used inside calc_score()

# COMMON VARIABLES
output_path <- "/Users/..../Output/"
dtw_BG <- read.csv('/Users/...../dtw_BG.csv', header=TRUE)
BG <- dtw_BG[,c(3,2)]
# OUTPUT CSV AND RETURN DATA FRAME
calc_score <- function(village_file) {
gw <- read.csv(village_file, header=TRUE)
#... REST OF CODE
write.csv(abc, paste0(output_path, stw$Village[[1]], "_score.csv"), row.names = FALSE)
return(abc)
}
# PASS FILE NAMES ITERATIVELY TO BUILD LIST OF DFs (WITH EACH CSV)
v_files <- list.files(path = "/path/to/inputs", pattern = "*.csv",
full.names = TRUE)
df_list <- lapply(v_files, calc_score)
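If you also want the elements of df_list named by village, one option (a sketch, assuming each input CSV is named after its village) is to derive the names from the file paths:

names(df_list) <- tools::file_path_sans_ext(basename(v_files))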
By the way, much of your code can possibly be tightened up, as distGeo can receive a matrix of lon/lat coordinates. Also, consider a cross join merge (i.e., all pairwise matches) between each pairing of the data frames and p to cut down on the repetitious lapply calls. For code maintainability, try to use column names instead of numbers.
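As a minimal illustration of such a cross join on hypothetical toy data:

a <- data.frame(Long = 1:2, Lat = 1:2)
b <- data.frame(Long = 3:5, Lat = 3:5)
merge(a, b, by = NULL, suffixes = c("", "_"))
# 2 x 3 = 6 rows, one per pairing of a row of a with a row of b; the
# duplicated column names coming from b get the "_" suffix (Long_, Lat_).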
NOTE: the code below needs testing on the full data and is shown as an example.
calc_score <- function(village_file) {
gw <- read.csv(village_file, header=TRUE)
### DATA FRAME SUBSETS
stw <- gw[gw$Depth_m <= 90,]
p <- stw[, c("Long", "Lat")]
R <- gw[gw$Depth_m <= 90 & gw$As_ug_L > 50, c("Long", "Lat")]
ITW <- gw[gw$Depth_m >= 45 & gw$Depth_m <= 90, c("Long", "Lat")]
ITW_10 <- gw[gw$Depth_m >= 45 & gw$Depth_m <= 90 & gw$As_ug_L <= 10, c("Long", "Lat")]
### MAX SCORE CALCULATION
cj <- merge(R, p, by=NULL, suffixes=c("", "_")) # CROSS JOIN OF ALL ROWS BETWEEN DFs
dist_R <- transform(cj, Distance = distGeo(cj[c("Long", "Lat")], cj[c("Long_", "Lat_")]))
dist_R100 <- subset(dist_R, Distance <= 100)
maxscore <- aggregate(cbind(Score=Distance) ~ Long_ + Lat_, dist_R100, FUN=length)$Score
### COUNT ITW100 CALCULATION
cj <- merge(ITW, p, by=NULL, suffixes=c("", "_")) # CROSS JOIN OF ALL ROWS BETWEEN DFs
dist_ITW <- transform(cj, Distance = distGeo(cj[c("Long", "Lat")], cj[c("Long_", "Lat_")]))
dist_ITW100 <- subset(dist_ITW, Distance <= 100)
count_itw <- aggregate(cbind(Count=Distance) ~ Long_ + Lat_, dist_ITW100, FUN=length)$Count
### COUNT ITW10 CALCULATION
if (nrow(ITW_10)==0) {
count_itw10 <- rep(0, length(maxscore))
} else {
cj <- merge(ITW_10, p, by=NULL, suffixes=c("", "_")) # CROSS JOIN OF ALL ROWS BETWEEN DFs
dist_ITW10 <- transform(cj, Distance = distGeo(cj[c("Long", "Lat")], cj[c("Long_", "Lat_")]))
dist_ITW10_100 <- subset(dist_ITW10, Distance <= 100)
count_itw10 <- aggregate(cbind(Count=Distance) ~ Long_ + Lat_, dist_ITW10_100, FUN=length)$Count
}
### MINIMUM DISTANCE
cj <- merge(BG, p, by=NULL, suffixes=c("", "_")) # CROSS JOIN OF ALL ROWS BETWEEN DFs
dist_BG <- transform(cj, Distance = distGeo(cj[c("Long", "Lat")], cj[c("Long_", "Lat_")]))
mdf <- merge(dist_R100, dist_BG, by=c("Long_", "Lat_"),
suffixes=c("", "_")) # MERGE AT p LEVEL
dtw <- transform(mdf, Distance = distGeo(mdf[c("Long", "Lat")], mdf[c("Long_", "Lat_")]))
dtw <- aggregate(Distance ~ Long + Lat, dtw, FUN=min)$Distance
### SCORE CALCULATION
dtw <- unlist(dtw)
dtw <- split(dtw, (0:length(dtw) %/% length(p[[1]])))
dtw <- dtw[-length (dtw)]
count <- sapply(dtw, function(d) length(d[d<=100]))
score <- maxscore - count
### FINAL DATA FRAME
village_df <- cbind.data.frame(stw, maxscore, count, score, count_itw, count_itw10)
write.csv(village_df, paste0(output_path, village_df$Village[[1]], "_score.csv"), row.names = FALSE)
return(village_df)
}
I would like to estimate the length of individuals based on their age and sex, using OLS in R. To that end I built the following model: m1 <- lm(length ~ age + sex, data = data.frame). Next, I created some simple residual plots by running:
op <- par(mfrow = c(2,2))
plot(resid(m1) ~ fitted(m1))
plot(resid(m1) ~ data.frame$age)
plot(resid(m1) ~ data.frame$sex)
qqnorm(resid(m1)); qqline(resid(m1))
par(op)
yielding this plot:
Strangely enough, the fitted values do not seem to span the range [165,180], but rather [165,170] ∪ [175,180] (top left plot). I do not understand why this is happening.
Below some sample data producing the plots above:
structure(list(length = c(173, 170, 172, 160, 162.5, 180, 179.5,
175, 168, 186.5, 163.5, 170.5, 160, 175.5, 186.5, 176.5, 168,
180.5, 179, 167, 183, 188.5, 176, 165, 170, 171, 176, 172, 187,
189, 180, 175.5, 162.5, 187, 164, 177, 170.5, 159.5, 161.5, 167,
178, 180.5, 168.5, 162, 171, 173, 171.5, 174.5, 177, 158, 175,
170, 183.5, 166, 174.5, 174, 176, 165, 163.5, 171.5, 161, 173,
165, 186, 171, 164.5, 182.5, 156.5, 156, 175, 168.5, 195, 164,
167.5, 168, 165.5, 172.5, 167, 175, 190, 170.5, 166, 155, 179.5,
175, 185, 174, 158.5, 172.5, 172.5, 173, 177, 161.5, 173.5, 159,
181, 176, 181.5, 167.5, 170.5), age = c(31.0965092402464, 67.7481177275838,
60.9062286105407, 54.776180698152, 57.8316221765914, 42.0287474332649,
47.1786447638604, 51.315537303217, 68.0876112251882, 32.3613963039014,
52.1259411362081, 50.7652292950034, 53.6947296372348, 64.6242299794661,
66.9733059548255, 66.8829568788501, 63.668720054757, 73.533196440794,
57.7659137577002, 43.7262149212868, 51.2416153319644, 30.7953456536619,
73.0403832991102, 52.2984257357974, 46.2614647501711, 35.7618069815195,
74.0670773442847, 35.6878850102669, 43.3894592744695, 65.0458590006845,
55.9671457905544, 71.2306639288159, 58.5653661875428, 40.0520191649555,
39.9698836413415, 44.0109514031485, 34.4722792607803, 47.5400410677618,
51.8822724161533, 46.9596167008898, 39.0143737166324, 49.0349075975359,
39.3812457221081, 48.2518822724162, 37.0376454483231, 30.425735797399,
31.5838466803559, 74.9459274469541, 46.3353867214237, 56.0602327173169,
54.4476386036961, 58.4120465434634, 47.64681724846, 39.047227926078,
45.2183436002738, 48.0246406570842, 41.5140314852841, 61.0732375085558,
52.2600958247776, 62.9760438056126, 70.715947980835, 70.5735797399042,
40.2436687200548, 35.0198494182067, 41.1772758384668, 57.2210814510609,
64.2710472279261, 59.6221765913758, 63.0088980150582, 48.5366187542779,
30.0369609856263, 48.8898015058179, 49.7741273100616, 54.7624914442163,
61.284052019165, 37.0102669404517, 58.4695414099932, 55.3483915126626,
39.4579055441478, 49.3333333333333, 37.9712525667351, 57.388090349076,
70.8199863107461, 37.0212183436003, 51.3675564681725, 48.3860369609856,
35.895961670089, 39.5208761122519, 37.4209445585216, 46.8692676249144,
65.3826146475017, 51.9425051334702, 33.2594113620808, 55.1156741957563,
33.9493497604381, 33.2895277207392, 42.0369609856263, 29.4976043805613,
54.9514031485284, 36.2327173169062), sex = structure(c(1L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L,
1L, 2L), .Label = c("0", "1"), class = "factor")), row.names = c(1L,
2L, 3L, 4L, 5L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 17L,
18L, 19L, 20L, 21L, 22L, 23L, 25L, 26L, 27L, 28L, 29L, 30L, 32L,
33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L,
47L, 48L, 49L, 50L, 51L, 53L, 54L, 56L, 57L, 58L, 59L, 60L, 62L,
63L, 64L, 65L, 66L, 67L, 68L, 70L, 73L, 74L, 75L, 76L, 77L, 78L,
79L, 81L, 82L, 85L, 86L, 87L, 88L, 90L, 92L, 94L, 95L, 96L, 97L,
98L, 99L, 100L, 102L, 103L, 104L, 105L, 106L, 107L, 109L, 110L,
111L, 112L, 113L, 114L, 115L, 116L, 117L, 118L, 119L), class = "data.frame")
Can anyone spot the flaw?
Following #Roland's comment: the plot shows that the within-group variation of the predicted values is much smaller than that of the observed values. Within each sex the fitted values are just a linear function of age, shifted by the sex coefficient, so the predictions form two narrow, non-overlapping bands:
library(ggplot2); theme_set(theme_bw())
# dd holds the dput data from the question
lm1 <- lm(length ~ age + sex, data = dd)
pp <- expand.grid(age = 30:75, sex = factor(0:1))
pp$length <- predict(lm1, newdata = pp)
ggplot(dd, aes(age, length, colour = sex)) +
  geom_point() +
  geom_point(data = pp, shape = 2) # open triangles: model predictions
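To see the two disjoint bands directly, you can split the fitted values by sex (a quick check, assuming the dput data above is stored as dd):

tapply(fitted(lm1), dd$sex, range)
# Each sex forms its own narrow band of fitted values; the offset between
# the two bands is the estimated sex coefficient.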
I'm working on a heatmap and following along this tutorial:
https://www.r-graph-gallery.com/283-the-hourly-heatmap/
To save a click, here's the code block to reproduce:
library(ggplot2)
library(dplyr) # easier data wrangling
library(viridis) # colour blind friendly palette, works in B&W also
library(Interpol.T) # will generate a large dataset on initial load
library(lubridate) # for easy date manipulation
library(ggExtra) # because remembering ggplot theme options is beyond me
library(tidyr)
data<- data(Trentino_hourly_T,package = "Interpol.T")
names(h_d_t)[1:5]<- c("stationid","date","hour","temp","flag")
df<- tbl_df(h_d_t) %>%
filter(stationid =="T0001")
df<- df %>% mutate(year = year(date),
month = month(date, label=TRUE),
day = day(date))
df$date<-ymd(df$date) # not necessary for plot but
#useful if you want to do further work with the data
#cleanup
rm(list=c("h_d_t","mo_bias","Tn","Tx",
"Th_int_list","calibration_l",
"calibration_shape","Tm_list"))
#create plotting df
df <-df %>% select(stationid,day,hour,month,year,temp)
Then a heatmap is made:
p <-ggplot(df,aes(day,hour,fill=temp))+
geom_tile(color= "white",size=0.1) +
scale_fill_viridis(name="Hrly Temps C",option ="C")
p <-p + facet_grid(year~month)
p <-p + scale_y_continuous(trans = "reverse", breaks = unique(df$hour))
So far so good, I can recreate this. However my own dataset is website visit data at the visit level, so many visits in a given day and hour. In addition to visits I also have a timeOnPage metric.
Sample of data below with dput.
I would like to heatmap the average hourly visits or timeOnPage. Here's what I tried.
Sample of my data:
> dput(sam)
structure(list(Day = structure(c(4L, 4L, 4L, 5L, 3L, 2L, 3L,
6L, 2L, 2L, 4L, 2L, 3L, 3L, 6L, 1L, 4L, 2L, 3L, 5L, 2L, 5L, 4L,
2L, 5L, 2L, 7L, 5L, 6L, 2L, 2L, 6L, 4L, 6L, 2L, 2L, 2L, 5L, 5L,
2L, 6L, 5L, 3L, 5L, 3L, 2L, 6L, 4L, 2L, 5L, 2L, 5L, 4L, 2L, 6L,
2L, 7L, 2L, 2L, 2L, 5L, 6L, 3L, 2L, 3L, 4L, 4L, 3L, 6L, 2L, 5L,
3L, 4L, 4L, 3L, 2L, 5L, 5L, 5L, 3L, 5L, 2L, 4L, 5L, 5L, 2L, 3L,
6L, 2L, 2L, 5L, 4L, 6L, 7L, 3L, 3L, 4L, 4L, 2L, 6L), .Label = c("Sun",
"Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"), class = c("ordered",
"factor")), Hour = c(18L, 7L, 3L, 22L, 11L, 11L, 9L, 16L, 16L,
13L, 18L, 18L, 10L, 19L, 7L, 13L, 18L, 14L, 10L, 20L, 17L, 6L,
21L, 15L, 18L, 7L, 12L, 10L, 16L, 14L, 18L, 13L, 17L, 10L, 19L,
20L, 14L, 16L, 10L, 9L, 16L, 9L, 8L, 13L, 17L, 17L, 11L, 15L,
22L, 17L, 18L, 17L, 7L, 19L, 12L, 2L, 12L, 15L, 7L, 17L, 17L,
18L, 13L, 10L, 19L, 9L, 13L, 13L, 17L, 21L, 23L, 4L, 17L, 12L,
12L, 9L, 17L, 19L, 7L, 4L, 5L, 17L, 6L, 23L, 3L, 14L, 19L, 13L,
7L, 11L, 9L, 13L, 9L, 19L, 11L, 5L, 20L, 20L, 19L, 11L), sessionID = c("1508980591045.l027p6mt",
"1510155616668.57i2wj1", "1510140439620.qu19kyo", "1510296404412.xasqfwqd10v1qdtl6jemi",
"1510082622485.szj2ja1e", "1511204933263.mq9bvi0d", "1511285142249.vp2fyfd9",
"1510965282725.x04h1dko", "1508801295434.e056cpef", "1508790369346.ly63bjgr",
"1509585154520.3usd036k", "1511834881064.e6f5evp", "1509471114265.2u807dwo",
"1507688054076.9dls0jk", "1509721031589.ho125mpb", "1510521845178.99j1ibkr",
"1510194555297.ioepfjgr", "1508793469455.hkc3xwa8", "1511288175700.62n5oc5",
"1510287319653.7ye9sjc", "1511227016523.yyn1of99", "1511448209341.1u5vir5p",
"1510205972493.qvu4ev7o", "1510615247987.swxhwct", "1508463701266.p52sdjzp",
"1510588449881.d6ffruv9", "1507404213416.rovwmmge", "1510857718956.2z57w2vr",
"1510360661780.19hznp3m78pvi", "1511820500742.48cyvo2a", "1508809029952.up0wqq5h",
"1508533120441.gdvhacjr7jswiquwuyp66r", "1509583258224.j8krac0sz5kx8pxohl4n29",
"1511549442901.5vm7na1l", "1508811367845.7b36epqk", "1509421407861.om0ydylt",
"1508794534361.p3gcoa0e", "1510877729807.viad220f", "1511460355269.omwvd00l",
"1508775703610.usuk2akm", "1510964376869.7e2crw9d", "1510247098808.np9ia23",
"1508860753512.3z4182b", "1510868797935.3nmpvkri", "1510105270807.4evhpys",
"1511831565084.27izf13f", "1510340973580.l9qj5drou5wmi", "1508364715184.14l4ikj",
"1509426566404.9qnp0m3", "1510275972333.hhqu0exc", "1510625679744.jk3vvt1v",
"1510881839700.c34skful", "1511365134270.57thqyir", "1509416741055.1f2cnmrp",
"1509738404263.8ajwpij", "1510570338116.h9a5j88", "1511640706961.qw8q1eh",
"1510011913201.eqd54kw", "1508769010911.wrpb329", "1508803518777.56b2ej2l",
"1509670743316.yhncp17j", "1511576965410.y47g0wgj", "1508876390209.wem8i3lh",
"1508779846415.hyx8qar", "1511322782502.s835px9", "1509554323957.osxgi0em",
"1510176829762.jncm9xwb", "1509482328620.sqdbob0u", "1508545652936.a5hqcmp1fw29",
"1508817816447.6mbdldxb", "1510297785623.33i6yhko", "1508843299131.3m26sqf5",
"1510191633431.cl5fh9ik", "1509565114633.bd5yrkf5", "1510690660714.818yxn5o",
"1507567660773.ybpbfgn", "1509667501973.1a9f9pyp", "1509674601865.yqvmcclv",
"1511450423709.s149r25q", "1511267096892.n5u1d0nv", "1509624499459.u57lgtt8",
"1510019204298.ka4w9kfh", "1511362131909.t26h6ig", "1510904968660.eowoea2q",
"1510225256391.4dk073ej", "1510006654569.reo2eili", "1509501692686.ng48bwnz",
"1509741958143.bxbf325r", "1508770633217.33ymrfgc", "1511810438817.zcgpr6vj",
"1510852180447.wywsj7f", "1510176833767.nev0iaec", "1509727547082.53van2sr",
"1507430914148.niu297m", "1508868705810.akd7r18h", "1510060231388.mz9ojf6g",
"1509592760232.qtrlxye8", "1509592651211.1r82ucw4", "1508812928318.f3st4004",
"1509734102140.leol1dnw"), uniquePageviews = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), timeOnPage = c(359, 149, 69, 146, 147, 119, 168, 69, 29, 0,
1542, 148, 242, 49, 457, 175, 175, 97, 79, 12, 0, 1141, 150,
236, 74, 128, 23, 147, 172, 223, 225, 88, 69, 156, 0, 49, 110,
150, 70, 123, 30, 145, 1629, 1, 119, 169, 48, 136, 529, 130,
149, 124, 281, 2483, 0, 60, 149, 50, 29, 124, 149, 0, 92, 149,
915, 47, 50, 89, 143, 84, 129, 147, 138, 80, 33, 226, 70, 146,
177, 98, 150, 32, 148, 149, 12, 338, 146, 204, 149, 148, 26,
149, 1110, 148, 23, 151, 0, 100, 0, 28)), row.names = c(20219L,
42612L, 42149L, 46707L, 40122L, 57449L, 60878L, 56707L, 11725L,
10102L, 29911L, 71743L, 25952L, 1492L, 35570L, 48411L, 43917L,
10530L, 61004L, 46446L, 58846L, 65695L, 44287L, 49341L, 2999L,
48502L, 627L, 54118L, 48148L, 70166L, 13346L, 4770L, 29745L,
67979L, 13832L, 24814L, 10692L, 54744L, 65995L, 8216L, 56683L,
44920L, 18121L, 54499L, 41155L, 71353L, 47606L, 1900L, 25023L,
45811L, 49937L, 54904L, 63607L, 24571L, 36060L, 48479L, 69086L,
37708L, 7353L, 12117L, 33912L, 68752L, 19081L, 8768L, 62647L,
28317L, 43172L, 26286L, 6359L, 14907L, 46733L, 16418L, 43797L,
28637L, 51671L, 1273L, 33677L, 34226L, 65759L, 60247L, 31739L,
38171L, 63497L, 55589L, 44462L, 37454L, 27141L, 36178L, 7543L,
69636L, 54030L, 43173L, 35743L, 852L, 18784L, 39283L, 30672L,
30663L, 14142L, 35933L), class = "data.frame", .Names = c("Day",
"Hour", "sessionID", "uniquePageviews", "timeOnPage"))
It looks like this:
> head(sam)
Day Hour sessionID uniquePageviews timeOnPage
20219 Wed 18 1508980591045.l027p6mt 1 359
42612 Wed 7 1510155616668.57i2wj1 1 149
42149 Wed 3 1510140439620.qu19kyo 1 69
46707 Thurs 22 1510296404412.xasqfwqd10v1qdtl6jemi 1 146
40122 Tues 11 1510082622485.szj2ja1e 1 147
57449 Mon 11 1511204933263.mq9bvi0d 1 119
> glimpse(sam)
Observations: 100
Variables: 5
$ Day <ord> Wed, Wed, Wed, Thurs, Tues, Mon, Tues, Fri, Mon, Mon, Wed, Mon, Tues, Tues, Fri, Sun, Wed, M...
$ Hour <int> 18, 7, 3, 22, 11, 11, 9, 16, 16, 13, 18, 18, 10, 19, 7, 13, 18, 14, 10, 20, 17, 6, 21, 15, 1...
$ sessionID <chr> "1508980591045.l027p6mt", "1510155616668.57i2wj1", "1510140439620.qu19kyo", "1510296404412.x...
$ uniquePageviews <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
$ timeOnPage <dbl> 359, 149, 69, 146, 147, 119, 168, 69, 29, 0, 1542, 148, 242, 49, 457, 175, 175, 97, 79, 12, ...
The metric uniquePageviews will always be 1 or 0, and in a heatmap it doesn't look great. Since it's session-level data there are multiple entries for each day/hour. For timeOnPage I would like to heatmap the mean time on page for a given hour and day-of-week combination.
So, as far as I can tell ggplot is summing everything whereas I want mean().
My initial code block:
# creates the initial heatmap
p <- ggplot(sam, aes(x = Day, y = Hour, fill = uniquePageviews)) +
geom_tile(color = "white", size = 0.1) +
scale_fill_viridis(name = "TimeOnPage", option ="C")
# order by hour of day going top to bottom asc
p <-p + scale_y_continuous(trans = "reverse", breaks = unique(df$hour))
I tried changing it to this but the results look exactly the same:
# gets the initial heatmap
p <- ggplot(sam, aes(x = Day, y = Hour, fill = uniquePageviews),
stat = "summary", fun.y = "mean") +
geom_tile(color = "white", size = 0.1) +
scale_fill_viridis(name = "Mean TimeOnPage", option ="C")
# order by hour of day going top to bottom asc
p <-p + scale_y_continuous(trans = "reverse", breaks = unique(df$hour))
I could do some dplyr group-by transformations on the data frame sam, but I was not sure whether ggplot2::geom_tile() takes care of that or not.
How can I create a heatmap with ggplot where the fill is based on the mean? Also, can someone clarify what exactly it's showing now? A total sum?
Not sure if I get your problem, but you can try the following:
library(tidyverse)
library(viridis)
sam %>%
group_by(Day, Hour) %>%
summarise(Mean=mean(timeOnPage)) %>%
ggplot(aes(x = Day, y = Hour, fill = Mean)) +
geom_tile(color = "white", size = 0.1) +
scale_fill_viridis(name = "TimeOnPage", option ="C")
This will calculate the mean timeOnPage per Day and Hour and plot it as a heatmap. As for what your original plot showed: geom_tile does not aggregate; it draws one tile per row of the data, so the tiles for a given Day/Hour cell are overplotted and the visible fill is that of whichever row happens to be drawn last, not a sum.
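The same pattern covers the average-hourly-visits part of the question; here is a sketch counting distinct sessions per Day/Hour cell (on this sample that is a raw count; divide by the number of days observed per weekday if you want a true average):

sam %>%
  group_by(Day, Hour) %>%
  summarise(Visits = n_distinct(sessionID)) %>%
  ggplot(aes(x = Day, y = Hour, fill = Visits)) +
  geom_tile(color = "white", size = 0.1) +
  scale_fill_viridis(name = "Visits", option = "C")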
I'm trying to match, in R, the output from a model specified in another program, HLM, here: http://justpaste.it/q10n
The model I've tried so far (a random slope with a correlated intercept) isn't matching up for the fixed effects:
m1 <- lmer(formula = mathach ~ 1 + freered + (1 + hrs | school), data = dat)
Here's the R output for comparison:
summary(m1)
Linear mixed model fit by REML ['lmerMod']
Formula:
mathach ~ 1 + freered + (1 + hrs | school)
Data: dat
REML criterion at convergence: 180.2
Scaled residuals:
Min 1Q Median 3Q Max
-2.1706 -0.4274 -0.0611 0.3395 4.3201
Random effects:
Groups Name Variance Std.Dev. Corr
school (Intercept) 102.96318 10.1471
hrs 6.03046 2.4557 -1.00
Residual 0.09409 0.3067
Number of obs: 100, groups: school, 10
Fixed effects:
Estimate Std. Error t value
(Intercept) 68.99782 0.59137 116.67
freered 0.48765 0.03025 16.12
Correlation of Fixed Effects:
(Intr)
freered -0.895
and the dataset:
dat <- structure(list(school = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L), .Label = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"), class = "factor"),
student = 1:100, hrs = c(0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 0L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 0L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 0L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L), mathach = c(100, 96, 92, 88, 84, 80, 76, 72, 68,
65, 95, 92, 89, 86, 83, 80, 77, 74, 71, 69, 90, 88, 86, 84,
82, 80, 78, 76, 74, 71, 85, 84, 83, 82, 81, 80, 79, 78, 77,
77, 80, 79.5, 79, 78.5, 78, 77.5, 77, 76.5, 76, 76, 75, 75.5,
76, 76.5, 77, 77.5, 78, 78.5, 79, 80, 70, 71, 72, 73, 74,
75, 76, 77, 78, 80, 65, 67, 69, 71, 73, 75, 77, 79, 81, 82,
60, 63, 66, 69, 72, 75, 78, 81, 84, 86, 55, 59, 63, 67, 71,
75, 79, 83, 87, 93), freered = c(30L, 30L, 30L, 30L, 30L,
30L, 30L, 30L, 30L, 30L, 27L, 27L, 27L, 27L, 27L, 27L, 27L,
27L, 27L, 27L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L,
25L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 13L, 13L, 13L, 13L, 13L,
13L, 13L, 13L, 13L, 13L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L)), .Names = c("school",
"student", "hrs", "mathach", "freered"), row.names = c(NA, -100L
), class = "data.frame")
Any suggestions?
Here's a start: we get closer by (1) putting in an interaction between (1+hrs) and freered in the fixed effects; (2) centering the hrs and freered variables (as stated in the HLM log):
library(lme4)
library(broom) ## for tidy() (cosmetic: may (???) need Github devel version?)
dat2 <- transform(dat,hrs=scale(hrs,scale=FALSE),
freered=scale(freered,scale=FALSE))
m1 <- lmer(formula = mathach ~ (1 + hrs)*freered + (1 + hrs | school),
data = dat2)
## equivalent, explicit response formula:
## ~ 1 + freered + hrs + freered : hrs + (1 + hrs | school)
print(tidy(m1),digits=3)
term estimate std.error statistic group
1 (Intercept) 77.5400 0.2677 289.619 fixed
2 hrs 0.0218 0.1245 0.175 fixed
3 freered 0.3811 0.0307 12.399 fixed
4 hrs:freered -0.2791 0.0143 -19.520 fixed
5 sd_(Intercept).school 0.8411 NA NA school
6 sd_hrs.school 0.3923 NA NA school
7 cor_(Intercept).hrs.school 0.1793 NA NA school
8 sd_Observation.Residual 0.3067 NA NA Residual