Related
Reproducible Dataset
This is my data:
#### Data for Stack ####
stack <- structure(list(Mins_Work = c(435L, 350L, 145L, 135L, 15L, 60L,
60L, 390L, 395L, 395L, 315L, 80L, 580L, 175L, 545L, 230L, 435L,
370L, 255L, 515L, 330L, 65L, 115L, 550L, 420L, 45L, 266L, 196L,
198L, 220L, 17L, 382L, 0L, 180L, 343L, 207L, 263L, 332L, 0L,
0L, 259L, 417L, 282L, 685L, 517L, 111L, 64L, 466L, 499L, 460L,
269L, 300L, 427L, 301L, 436L, 342L, 229L, 379L, 102L, 146L, NA,
94L, 345L, 73L, 204L, 512L, 113L, 135L, 458L, 493L, 552L, 108L,
335L, 395L, 508L, 546L, 396L, 159L, 325L, 747L, 650L, 377L, 461L,
669L, 186L, 220L, 410L, 708L, 409L, 515L, 413L, 166L, 451L, 660L,
177L, 192L, 191L, 461L, 637L, 297L, 601L, 586L, 270L, 479L, 0L,
480L, 397L, 174L, 111L, 0L, 610L, 332L, 345L, 423L, 160L, 611L,
0L, 345L, 550L, 324L, 427L, 505L, 632L, 560L, 230L, 495L, 235L,
522L, 654L, 465L, 377L, 260L, 572L, 612L, 594L, 624L, 237L, 0L,
38L, 409L, 634L, 292L, 706L, 399L, 568L, 0L, 694L, 298L, 616L,
553L, 581L, 423L, 636L, 623L, 338L, 345L, 521L, 438L, 504L, 600L,
616L, 656L, 285L, 474L, 688L, 278L, 383L, 535L, 363L, 470L, 457L,
303L, 123L, 363L, 329L, 513L, 636L, 421L, 220L, 430L, 428L, 536L,
156L, 615L, 429L, 103L, 332L, 250L, 281L, 248L, 435L, 589L, 515L,
158L, 0L, 649L, 427L, 193L, 225L, 0L, 280L, 163L, 536L, 301L,
406L, 230L, 519L, 0L, 303L, 472L, 392L, 326L, 368L, 405L, 515L,
308L, 259L, 769L, 93L, 517L, 261L, 420L, 248L, 265L, 834L, 313L,
131L, 298L, 134L, 385L, 648L, 529L, 487L, 533L, 641L, 429L, 339L,
508L, 560L, 439L, 381L, 397L, 692L, NA), Coffee_Cups = c(3L,
0L, 2L, 6L, 4L, 5L, 3L, 3L, 2L, 2L, 3L, 1L, 1L, 3L, 2L, 2L, 0L,
1L, 1L, 4L, 4L, 3L, 0L, 1L, 3L, 0L, 0L, 0L, 0L, 2L, 0L, 1L, 2L,
3L, 2L, 2L, 4L, 3L, 6L, 6L, 3L, 4L, 6L, 8L, 3L, 5L, 0L, 2L, 2L,
8L, 6L, 4L, 6L, 4L, 4L, 2L, 6L, 6L, 5L, 1L, 3L, 1L, 5L, 4L, 6L,
5L, 0L, 6L, 6L, 4L, 4L, 2L, 2L, 6L, 6L, 7L, 3L, 3L, 0L, 5L, 7L,
6L, 3L, 5L, 3L, 3L, 1L, 9L, 9L, 3L, 3L, 6L, 6L, 6L, 3L, 0L, 7L,
6L, 6L, 3L, 9L, 3L, 8L, 8L, 3L, 3L, 7L, 6L, 3L, 3L, 3L, 6L, 6L,
6L, 1L, 9L, 3L, 3L, 2L, 6L, 3L, 6L, 9L, 6L, 8L, 9L, 6L, 6L, 6L,
0L, 3L, 0L, 3L, 3L, 6L, 3L, 0L, 9L, 3L, 0L, 2L, 0L, 6L, 6L, 6L,
3L, 6L, 3L, 9L, 3L, 0L, 0L, 6L, 3L, 3L, 3L, 3L, 6L, 0L, 6L, 3L,
3L, 5L, 5L, 3L, 0L, 6L, 4L, 2L, 0L, 2L, 4L, 0L, 6L, 4L, 4L, 2L,
2L, 0L, 9L, 6L, 3L, 6L, 6L, 9L, 0L, 6L, 6L, 6L, 6L, 6L, 6L, 3L,
3L, 0L, 9L, 6L, 3L, 6L, 3L, 6L, 1L, 6L, 6L, 6L, 6L, 6L, 1L, 3L,
9L, 6L, 3L, 6L, 9L, 3L, 5L, 6L, 3L, 0L, 6L, 3L, 3L, 5L, 0L, 6L,
3L, 5L, 3L, 0L, 6L, 7L, 3L, 6L, 6L, 6L, 6L, 3L, 5L, 6L, 7L, 6L,
6L, 4L, 3L)), class = "data.frame", row.names = c(NA, -244L))
Solution So Far
I'm trying to cut my coffee data into three groups, a low group, a medium group, and a high group. Here is how I tried doing so:
#### Load Libraries ####
library(tidyverse)
library(ggpubr)
#### Transform Data: Coffee ####
# Labels for the three consumption groups, in the same order as the intervals
coffee_labels <- c("Low", "Medium", "High") # labels
range(stack$Coffee_Cups) # get range for split
coffee_breaks <- seq(from = 0,
to = 9,
by = 3) # split from 0 to 9 in 3 pt intervals
# NOTE: with its defaults (right = TRUE, include.lowest = FALSE) cut() builds
# the half-open intervals (0,3], (3,6], (6,9]. A value equal to the lowest
# break (here 0) lies outside every interval and is returned as NA.
coffee_transform <- cut(x= stack$Coffee_Cups,
labels = coffee_labels,
breaks = coffee_breaks) # add labels and breaks
# coffee_transform has one element per row of stack (it was built from
# stack$Coffee_Cups), so it can be attached directly as a new column
stack_transform <- stack %>%
mutate(coffee_level = coffee_transform) # mutate to add to data
tail(stack_transform$coffee_level, 30) # check transform
Problem
However, when I print the tail command at the end, I get these NA values, which I assume is from an improper cut:
[1] Low Medium Medium Low <NA> Medium Low Low Medium <NA> Medium
[12] Low Medium Low <NA> Medium High Low Medium Medium Medium Medium
[23] Low Medium Medium High Medium Medium Medium Low
Levels: Low Medium High
I looked and those values correspond to my coffee consumption equaling zero, yet I already set the cut from 0 to 9. Naturally, when I try to make a boxplot with this, the NA levels get included, which I don't want:
#### Transform Coffee Boxplot ####
# Boxplot of Mins_Work for each coffee_level group, one colour per group.
# NOTE(review): rows whose coffee_level is NA are not filtered out before
# plotting.
ggboxplot(stack_transform,
x="coffee_level",
y="Mins_Work",
palette = "simpsons",
color = "coffee_level",
title = "Coffee Consumption Level Productivity",
caption = "*Data obtained from local matrix.",
xlab = "Coffee Consumption Level",
ylab = "Minutes of Productivity")+
# theme_bw() is applied first so that the theme() tweaks below are not
# overwritten by it
theme_bw()+
# the legend is hidden; caption and title fonts are restyled
theme(legend.position = "none",
plot.caption = element_text(face = "italic"),
plot.title = element_text(face = "bold",
size = 18,
family = "mono"))
Question
How do I fix these NA values? I want my zero values to be included into the "low" group if possible.
Would this work for you? It seems you are using base R, so a nested ifelse statement may be simpler:
# Bin Coffee_Cups into three character labels with nested ifelse() calls.
# %in% returns FALSE (never NA) for a missing value, so a missing Coffee_Cups
# or any value outside 0:9 falls through to the final NA.
stack$coffee_cat <- ifelse(stack$Coffee_Cups %in% 0:3, "Low",
ifelse(stack$Coffee_Cups %in% 4:6, "Medium",
ifelse(stack$Coffee_Cups %in% 7:9, "High", NA)))
Output
# Mins_Work Coffee_Cups coffee_cat
#1 435 3 Low
#2 350 0 Low
#3 145 2 Low
#4 135 6 Medium
#5 15 4 Medium
#6 60 5 Medium
case_when would be a dplyr alternative:
# dplyr version of the same binning: the conditions are tried in order and
# rows matching none of them get NA. The result is printed, not stored.
stack %>%
  mutate(
    coffee_level = case_when(
      Coffee_Cups %in% 0:3 ~ "Low",
      Coffee_Cups %in% 4:6 ~ "Medium",
      Coffee_Cups %in% 7:9 ~ "High"
    )
  )
To include the zeros when cutting, you could also use the Hmisc::cut2 function:
# Explicit cut points give the groups 0-3, 4-6 and 7-9. cut2() intervals are
# closed on the left, so 0 is included, and by default (minmax = TRUE) the
# cut points are extended to the maximum of the data. Using g = 3 instead
# would form quantile groups whose boundaries depend on the data.
stack$coffee_Hmisc <- factor(
  Hmisc::cut2(stack$Coffee_Cups, cuts = c(0, 4, 7)),
  labels = coffee_labels
)
I have four datasets derived and processed identically (though differing in size due to the availability of Landsat scenes)
I am trying to compute ANOVA using the formula:
# Two-way repeated-measures ANOVA: LST is the dependent variable, JulianDay
# identifies the repeated unit, and Buffer and TimePeriod are the
# within-subject factors. "ges" requests generalized eta squared.
res.aov <- anova_test(
  data = LST_Weather_dataset_ANOVA,
  dv = LST,
  wid = JulianDay,
  within = c(Buffer, TimePeriod),
  effect.size = "ges",
  detailed = TRUE
)
# Print the ANOVA table, letting the function choose the sphericity correction
get_anova_table(res.aov, correction = "auto")
Where:
*) LST = surface temperature deviation in C
*) JulianDay = days since start of year
*) Buffer = a value 100-1900 - one of 19 areas outward from the boundary of a solar power plant (each 100m wide)
*) TimePeriod = a factor with a value of 0 or 1 corresponding to pre-/post-construction of the solar power plant.
The intent is to investigate if the construction of the installation affected the adjacent land surface temperature.
At three sites the ANOVA runs successfully, however at the fourth site it doesn't and fails with the error:
Error in lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
0 (non-NA) cases
I have 381 rows of data in 4 columns (extract below), the only difference I can think of here is that I had to remove two paired months from the time series as data was not available in one of the months. This means there are 20 months of data, rather than 24. Every other processing step is identical.
Reading online I have searched for N/As (there are none), and can't see how there are levels without values as every cell has data. I don't know how to properly evaluate this, though, as it seems this is the root of the error.
I'm hoping someone will know the code needed and/or be able to suggest a way forwards.
Buffer LST JulianDay TimePeriod
1800 -0.04576149 73 2
1900 -0.03422945 73 2
1900 -0.02089755 302 1
1900 -0.02062432 96 1
1900 -0.01465229 192 1
1900 -0.00643754 128 1
1900 -0.00333345 105 2
1800 -0.00266312 366 1
1900 -0.00181226 201 2
1900 -0.00158173 169 2
1900 -1.81E-05 41 2
1800 0.00144813 128 1
and 367 additional rows...
[Edits]
Per comments below:
dput() whole dataframe
dput() subset (as suggested)
Thanks @Dion for noting that anova_test is from the rstatix package.
1)
> dput(LST_Weather_dataset_ANOVA)
structure(list(Buffer = c(100L, 200L, 300L, 400L, 500L, 600L,
700L, 800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L, 100L, 200L, 300L, 400L, 500L, 600L, 700L,
800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L, 100L, 200L, 300L, 400L, 500L, 600L, 700L,
800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L, 100L, 200L, 300L, 400L, 500L, 600L, 700L,
800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L, 100L, 200L, 300L, 400L, 500L, 600L, 700L,
800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L, 100L, 200L, 300L, 400L, 500L, 600L, 700L,
800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L, 100L, 200L, 300L, 400L, 500L, 600L, 700L,
800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L, 100L, 200L, 300L, 400L, 500L, 600L, 700L,
800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L, 100L, 200L, 300L, 400L, 500L, 600L, 700L,
800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L, 100L, 200L, 300L, 400L, 500L, 600L, 700L,
800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L, 100L, 200L, 300L, 400L, 500L, 600L, 700L,
800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L, 100L, 200L, 300L, 400L, 500L, 600L, 700L,
800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L, 100L, 200L, 300L, 400L, 500L, 600L, 700L,
800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L, 100L, 200L, 300L, 400L, 500L, 600L, 700L,
800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L, 100L, 200L, 300L, 400L, 500L, 600L, 700L,
800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L, 100L, 200L, 300L, 400L, 500L, 600L, 700L,
800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L, 100L, 200L, 300L, 400L, 500L, 600L, 700L,
800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L, 100L, 200L, 300L, 400L, 500L, 600L, 700L,
800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L, 100L, 200L, 300L, 400L, 500L, 600L, 700L,
800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L, 100L, 200L, 300L, 400L, 500L, 600L, 700L,
800L, 900L, 1000L, 1100L, 1200L, 1300L, 1400L, 1500L, 1600L,
1700L, 1800L, 1900L), LST = c(0.91797777, 0.95083024, 0.79129483,
0.74791195, 0.68740945, 0.64516119, 0.74870729, 0.78357522, 0.83663769,
0.82156894, 0.77440129, 0.62769619, 0.52052404, 0.46497939, 0.34456476,
0.20359411, 0.11688336, 0.04136486, -0.02089755, 1.15111659,
1.20353638, 1.11717501, 1.0286145, 0.90984545, 0.82983601, 0.78186792,
0.73227976, 0.6989393, 0.65015275, 0.56241798, 0.39651023, 0.34213091,
0.3386525, 0.24000145, 0.11809023, 0.07704512, -0.00266312, 0.01273022,
1.04229626, 1.14347392, 1.1156609, 1.10575157, 1.01202522, 0.77829087,
0.80477079, 0.79677169, 0.83116477, 0.83242401, 0.82394197, 0.72073306,
0.64099082, 0.58188225, 0.43328083, 0.28349521, 0.19752629, 0.10636456,
0.01987005, 0.74458844, 0.71512573, 0.6395358, 0.65294657, 0.63325921,
0.56155255, 0.60860815, 0.60614753, 0.59989994, 0.58766288, 0.57257261,
0.50018929, 0.4367402, 0.40497079, 0.31822141, 0.2300726, 0.16928876,
0.09449034, 0.01799424, 0.82747052, 0.78262774, 0.65488597, 0.62609552,
0.60057131, 0.59950609, 0.6609992, 0.6876772, 0.73196883, 0.75516596,
0.75554112, 0.64167458, 0.54703129, 0.49947692, 0.38230481, 0.25519237,
0.16087274, 0.07759223, 0.00820849, 0.75009747, 0.71421977, 0.62411035,
0.58621041, 0.58438012, 0.61346156, 0.72712994, 0.81372726, 0.87579554,
0.88934787, 0.87369461, 0.74686202, 0.64084028, 0.5599638, 0.40021941,
0.23612052, 0.13408522, 0.04484869, -0.02062432, 0.22133116,
0.28562902, 0.24359043, 0.17788898, 0.16563242, 0.11740664, 0.10102937,
0.07328697, 0.07948283, 0.07521508, 0.08526232, 0.0548022, 0.04632606,
0.06670398, 0.03262545, 0.00650875, 0.01186519, 0.00144813, -0.00643754,
0.26360849, 0.22139941, 0.16915041, 0.13499715, 0.12846785, 0.15351528,
0.15321108, 0.13963269, 0.13413671, 0.13097696, 0.15897844, 0.15489366,
0.12600815, 0.12363834, 0.0943688, 0.07324289, 0.0565765, 0.04005241,
0.01346488, 0.42361198, 0.39149841, 0.29086274, 0.21492842, 0.20664552,
0.24524285, 0.30548979, 0.35256808, 0.37350282, 0.38680061, 0.38567758,
0.31177736, 0.24643091, 0.22001284, 0.14356522, 0.07076854, 0.04168654,
0.01276553, -0.01465229, 0.57032414, 0.50658577, 0.41717664,
0.36134446, 0.35794989, 0.38457285, 0.43700723, 0.48358206, 0.50516801,
0.50086146, 0.49398709, 0.41516438, 0.33165215, 0.28357127, 0.20030152,
0.11993505, 0.08438345, 0.05755944, 0.01071499, 0.04963208, 0.34087747,
0.38385889, 0.40408637, 0.41182138, 0.15662208, 0.18857013, 0.17978741,
0.1533216, 0.1451422, 0.14890638, 0.14090521, 0.1782449, 0.23624089,
0.21003477, 0.13812217, 0.10759364, 0.07225312, 0.03185378, 0.27507486,
0.54404521, 0.56568824, 0.58543167, 0.49124799, 0.28299777, 0.27514982,
0.27526446, 0.27376722, 0.24620415, 0.22871699, 0.19647326, 0.2450593,
0.27133386, 0.15248773, 0.06240341, 0.04933824, 0.03356535, -1.81e-05,
0.21776379, 0.37010032, 0.32743525, 0.30588107, 0.31226738, 0.30518286,
0.32637517, 0.31003415, 0.23691586, 0.1985241, 0.16143326, 0.12384526,
0.11556386, 0.09243356, 0.05773894, 0.03660942, 0.02173758, -0.04576149,
-0.03422945, 0.06214728, 0.26440563, 0.24838816, 0.22704611,
0.17230754, 0.15660109, 0.18689433, 0.24464547, 0.28273218, 0.29602945,
0.29992488, 0.24679735, 0.24521192, 0.23913767, 0.15081173, 0.08724556,
0.05561237, 0.02530266, -0.00333345, 0.11993489, 0.20504424,
0.17323488, 0.14541868, 0.10994579, 0.12741154, 0.17959797, 0.22553943,
0.26564836, 0.29760832, 0.3207305, 0.28592135, 0.26551685, 0.2493214,
0.15767906, 0.0883716, 0.05058495, 0.02207594, 0.00162532, 0.05621313,
0.08020623, 0.05187855, 0.02643543, 0.02422505, 0.05372454, 0.09563737,
0.14735627, 0.18199015, 0.22456299, 0.25302274, 0.21978124, 0.19092835,
0.18255829, 0.11850551, 0.0581734, 0.03406168, 0.01868243, -0.00158173,
0.00980756, 0.07077972, 0.05126985, 0.03126771, 0.01828044, 0.00678076,
0.03566275, 0.05622289, 0.07218645, 0.08767578, 0.11078182, 0.08827425,
0.08881865, 0.10037876, 0.05952601, 0.03440435, 0.01843206, 0.0091852,
-0.00181226, 0.08737325, 0.14470842, 0.13066747, 0.12324597,
0.12014198, 0.13435757, 0.17843025, 0.19926835, 0.20503774, 0.20485414,
0.2124073, 0.1864257, 0.18810996, 0.20665551, 0.13839744, 0.08488387,
0.06246853, 0.03463723, 0.00349753, 0.35245488, 0.57692156, 0.64897028,
0.67306088, 0.68344534, 0.56106697, 0.52144197, 0.49250191, 0.47494065,
0.4359944, 0.39638743, 0.32554099, 0.28717774, 0.2826675, 0.22703594,
0.18186983, 0.15875118, 0.09672536, 0.04305742, 0.24294606, 0.54654222,
0.56344638, 0.53312729, 0.47324972, 0.34482643, 0.34915085, 0.33729055,
0.32086985, 0.29578347, 0.25030669, 0.17928298, 0.17007511, 0.18375903,
0.15222616, 0.10934224, 0.07536797, 0.04154465, 0.02550096),
JulianDay = c(302L, 302L, 302L, 302L, 302L, 302L, 302L, 302L,
302L, 302L, 302L, 302L, 302L, 302L, 302L, 302L, 302L, 302L,
302L, 366L, 366L, 366L, 366L, 366L, 366L, 366L, 366L, 366L,
366L, 366L, 366L, 366L, 366L, 366L, 366L, 366L, 366L, 366L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 64L, 64L, 64L, 64L, 64L,
64L, 64L, 64L, 64L, 64L, 64L, 64L, 64L, 64L, 64L, 64L, 64L,
64L, 64L, 80L, 80L, 80L, 80L, 80L, 80L, 80L, 80L, 80L, 80L,
80L, 80L, 80L, 80L, 80L, 80L, 80L, 80L, 80L, 96L, 96L, 96L,
96L, 96L, 96L, 96L, 96L, 96L, 96L, 96L, 96L, 96L, 96L, 96L,
96L, 96L, 96L, 96L, 128L, 128L, 128L, 128L, 128L, 128L, 128L,
128L, 128L, 128L, 128L, 128L, 128L, 128L, 128L, 128L, 128L,
128L, 128L, 160L, 160L, 160L, 160L, 160L, 160L, 160L, 160L,
160L, 160L, 160L, 160L, 160L, 160L, 160L, 160L, 160L, 160L,
160L, 192L, 192L, 192L, 192L, 192L, 192L, 192L, 192L, 192L,
192L, 192L, 192L, 192L, 192L, 192L, 192L, 192L, 192L, 192L,
224L, 224L, 224L, 224L, 224L, 224L, 224L, 224L, 224L, 224L,
224L, 224L, 224L, 224L, 224L, 224L, 224L, 224L, 224L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L,
41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 41L, 73L, 73L,
73L, 73L, 73L, 73L, 73L, 73L, 73L, 73L, 73L, 73L, 73L, 73L,
73L, 73L, 73L, 73L, 73L, 105L, 105L, 105L, 105L, 105L, 105L,
105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L,
105L, 105L, 105L, 137L, 137L, 137L, 137L, 137L, 137L, 137L,
137L, 137L, 137L, 137L, 137L, 137L, 137L, 137L, 137L, 137L,
137L, 137L, 169L, 169L, 169L, 169L, 169L, 169L, 169L, 169L,
169L, 169L, 169L, 169L, 169L, 169L, 169L, 169L, 169L, 169L,
169L, 201L, 201L, 201L, 201L, 201L, 201L, 201L, 201L, 201L,
201L, 201L, 201L, 201L, 201L, 201L, 201L, 201L, 201L, 201L,
217L, 217L, 217L, 217L, 217L, 217L, 217L, 217L, 217L, 217L,
217L, 217L, 217L, 217L, 217L, 217L, 217L, 217L, 217L, 313L,
313L, 313L, 313L, 313L, 313L, 313L, 313L, 313L, 313L, 313L,
313L, 313L, 313L, 313L, 313L, 313L, 313L, 313L, 361L, 361L,
361L, 361L, 361L, 361L, 361L, 361L, 361L, 361L, 361L, 361L,
361L, 361L, 361L, 361L, 361L, 361L, 361L), TimePeriod = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-380L))
> dput(LST_Weather_dataset_ANOVA[sample(1:nrow(LST_Weather_dataset_ANOVA), 50),])
structure(list(Buffer = c(800L, 1400L, 500L, 200L, 400L, 1400L,
100L, 1600L, 1800L, 100L, 1400L, 1500L, 900L, 700L, 800L, 600L,
400L, 1300L, 500L, 700L, 700L, 300L, 700L, 200L, 200L, 500L,
500L, 900L, 1000L, 1300L, 1400L, 1600L, 700L, 400L, 500L, 200L,
400L, 1500L, 1400L, 800L, 500L, 1200L, 1500L, 1900L, 600L, 800L,
100L, 1000L, 900L, 1100L), LST = c(0.48358206, 0.46497939, 0.41182138,
0.07077972, 0.17788898, 0.18255829, 0.21776379, 0.03660942, 0.04154465,
0.42361198, 0.49947692, 0.38230481, 0.28273218, 0.18857013, 0.33729055,
0.56106697, 0.13499715, 0.28717774, 0.12014198, 0.78186792, 0.74870729,
0.56344638, 0.18689433, 0.54404521, 0.78262774, 0.60057131, 1.01202522,
0.20503774, 0.13097696, 0.34213091, 0.5599638, 0.08724556, 0.17843025,
1.0286145, 0.01828044, 0.22139941, 0.67306088, 0.15248773, 0.22001284,
0.27526446, 0.02422505, 0.50018929, 0.31822141, 0.01799424, 0.56155255,
0.13963269, 0.27507486, 0.29578347, 0.18199015, 0.3207305), JulianDay = c(224L,
302L, 9L, 201L, 128L, 169L, 73L, 73L, 361L, 192L, 80L, 80L, 105L,
9L, 361L, 313L, 160L, 313L, 217L, 366L, 302L, 361L, 105L, 41L,
80L, 80L, 16L, 217L, 160L, 366L, 96L, 105L, 217L, 366L, 201L,
160L, 313L, 41L, 192L, 41L, 169L, 64L, 64L, 64L, 64L, 160L, 41L,
361L, 169L, 137L), TimePeriod = c(1L, 1L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L)), row.names = c(179L,
14L, 195L, 306L, 118L, 299L, 229L, 244L, 379L, 153L, 90L, 91L,
256L, 197L, 369L, 348L, 137L, 355L, 328L, 26L, 7L, 364L, 254L,
211L, 78L, 81L, 43L, 332L, 143L, 32L, 109L, 263L, 330L, 23L,
309L, 135L, 346L, 224L, 166L, 217L, 290L, 69L, 72L, 76L, 63L,
141L, 210L, 371L, 294L, 277L), class = "data.frame")
You are running a repeated-measures ANOVA, which requires the observations to be complete for every individual across your specified within-subject effects. In your case, you need to ensure that, for every JulianDay, the observations are complete for every combination of Buffer and TimePeriod.
We can tabulate it by using table() and you can see for all the JulianDays they are incomplete, for example on 9 and 16:
# Count observations for every Buffer x TimePeriod x JulianDay cell and show
# only the slices for Julian days 9 and 16
with(
  LST_Weather_dataset_ANOVA,
  table(Buffer, TimePeriod, JulianDay)
)[, , c("9", "16")]
, , JulianDay = 9
TimePeriod
Buffer 1 2
100 0 1
200 0 1
300 0 1
400 0 1
500 0 1
600 0 1
700 0 1
800 0 1
900 0 1
1000 0 1
1100 0 1
1200 0 1
1300 0 1
1400 0 1
1500 0 1
1600 0 1
1700 0 1
1800 0 1
1900 0 1
, , JulianDay = 16
TimePeriod
Buffer 1 2
100 1 0
200 1 0
300 1 0
400 1 0
500 1 0
600 1 0
700 1 0
800 1 0
900 1 0
1000 1 0
1100 1 0
1200 1 0
1300 1 0
1400 1 0
1500 1 0
1600 1 0
1700 1 0
1800 1 0
1900 1 0
As you have noted, if you reconcile the dates between sites, it will work. I am not very sure how you converted the JulianDay to months, but using your data, it works if I just do
# Derive a calendar month from the day-of-year so that pre- and
# post-construction scenes can be paired by month instead of by exact day.
df <- LST_Weather_dataset_ANOVA
# "%j" parses the day of the year. 2020 is a leap year, so day 366 is valid;
# only the month is kept, so the year itself is otherwise arbitrary.
scene_date <- as.Date(paste("2020", df$JulianDay), format = "%Y %j")
# Index month.name rather than calling months(): months() returns names in
# the session locale, which would break the English comparison below.
df$Month <- month.name[as.integer(format(scene_date, "%m"))]
df <- subset(df, Month %in% c("May", "June"))
# Every Buffer x TimePeriod cell should now hold one observation per month
with(df, table(Buffer, TimePeriod, Month))
, , Month = June
TimePeriod
Buffer 1 2
100 1 1
200 1 1
300 1 1
400 1 1
500 1 1
600 1 1
700 1 1
800 1 1
900 1 1
1000 1 1
1100 1 1
1200 1 1
1300 1 1
1400 1 1
1500 1 1
1600 1 1
1700 1 1
1800 1 1
1900 1 1
, , Month = May
TimePeriod
Buffer 1 2
100 1 1
200 1 1
300 1 1
400 1 1
500 1 1
600 1 1
700 1 1
800 1 1
900 1 1
1000 1 1
1100 1 1
1200 1 1
1300 1 1
1400 1 1
1500 1 1
1600 1 1
1700 1 1
1800 1 1
1900 1 1
You can see for months June and May, they are complete (no zeros), and if we run anova, it works:
# Same repeated-measures ANOVA, but with Month as the repeated unit so that
# every Buffer x TimePeriod combination is observed for each unit
res.aov <- anova_test(
  data = df,
  dv = LST,
  wid = Month,
  within = c(Buffer, TimePeriod),
  effect.size = "ges",
  detailed = TRUE
)
ANOVA Table (type III tests)
Effect DFn DFd SSn SSd F p p<.05 ges
1 (Intercept) 1 1 1.217 0.005 222.936 4.30e-02 * 0.933
2 Buffer 18 18 0.256 0.026 9.933 5.49e-06 * 0.746
3 TimePeriod 1 1 0.013 0.048 0.274 6.93e-01 0.130
4 Buffer:TimePeriod 18 18 0.181 0.008 21.476 1.20e-08 * 0.674
While ironing last night I wondered if JulianDay might be the source of the error. It is derived from the dates of the Landsat scenes from which the dependent variable data are derived, so it is different for each site.
Editing the dataframe to replace the JulianDay column with Month and amending the code to:
# Inspect the column types of the edited data frame (JulianDay replaced by
# Month)
str(LST_Weather_dataset_ANOVA)
# Repeated-measures ANOVA with Month identifying the repeated unit
res.aov <- anova_test(
  data = LST_Weather_dataset_ANOVA,
  dv = LST,
  wid = Month,
  within = c(Buffer, TimePeriod),
  effect.size = "ges",
  detailed = TRUE
)
get_anova_table(res.aov, correction = "auto")
...the ANOVA test runs successfully:
> res.aov <- anova_test(
+ data = LST_Weather_dataset_ANOVA, dv = LST, wid = Month,
+ within = c(Buffer, TimePeriod),
+ effect.size = "ges",
+ detailed = TRUE,
+ )
> get_anova_table(res.aov, correction = "auto")
ANOVA Table (type III tests)
Effect DFn DFd SSn SSd F p p<.05 ges
1 (Intercept) 1 9 36.781 6.593 50.212 5.75e-05 * 0.735
2 Buffer 18 162 8.042 3.041 23.801 1.81e-36 * 0.378
3 TimePeriod 1 9 5.065 2.506 18.194 2.00e-03 * 0.276
4 Buffer:TimePeriod 18 162 1.713 1.117 13.800 2.71e-24 * 0.114
But I still don't fully understand why...
Hopefully this will enable someone to comment and provide an explanation?
I am new to this website and to coding as well. I was wondering if any of you could help me out
I need to calculate the Top 5 Movies, by rating distribution, calculating the percentage of ratings for each movie that are 4 stars or higher.
So far I was only able to calculate the number of occurrences using dplyr.
Is it possible to calculate it using dplyr (something similar to my coding)?
I'm not sure whether I need to mutate to come up with the solution or if there's another way to do so.
My code so far:
# Remove rows containing missing values, then list the five movies that
# received the most ratings (counts only, no percentages yet)
dfAux1 <- na.omit(dfAux)
dfAux1 %>%
  group_by(movie) %>%
  summarise(tot = n()) %>%
  arrange(desc(tot)) %>%
  head(n = 5)
the result should be something like this:
**Expected result**:
0.7000000, 'The Shawshank Redemption'
0.5333333, 'Star Wars IV - A New Hope'
0.5000000, 'Gladiator'
0.4444444, 'Blade Runner'
0.4375000, 'The Silence of the Lambs'
and so far this is my result:
# A tibble: 5 x 2
movie tot
<fctr> <int>
1 Toy Story 17
2 The Silence of the Lambs 16
3 Star Wars IV - A New Hope 15
4 Star Wars VI - Return of the Jedi 14
5 Independence Day 13
edit:
str(dfAux1)
'data.frame': 241 obs. of 2 variables:
$ Rating: int 1 5 4 2 4 5 4 2 3 2 ...
$ movie : Factor w/ 20 levels "Star Wars IV - A New Hope",..: 1 1 1 1 1 1 1 1 1 1 ...
- attr(*, "na.action")=Class 'omit' Named int [1:159] 3 4 7 16 17 23 27 28 34 36 ...
.. ..- attr(*, "names")= chr [1:159] "3" "4" "7" "16" ...
dput(dfAux1)
structure(list(Rating = c(1L, 5L, 4L, 2L, 4L, 5L, 4L, 2L, 3L,
2L, 3L, 4L, 4L, 5L, 1L, 5L, 3L, 3L, 3L, 4L, 1L, 2L, 1L, 5L, 3L,
4L, 5L, 1L, 2L, 2L, 4L, 4L, 3L, 5L, 2L, 3L, 1L, 1L, 2L, 2L, 5L,
1L, 4L, 1L, 4L, 5L, 5L, 5L, 4L, 4L, 4L, 2L, 4L, 1L, 3L, 2L, 3L,
2L, 4L, 2L, 5L, 3L, 4L, 1L, 5L, 4L, 2L, 1L, 1L, 4L, 2L, 4L, 5L,
5L, 2L, 1L, 4L, 2L, 1L, 4L, 2L, 3L, 2L, 4L, 4L, 5L, 2L, 4L, 3L,
2L, 2L, 4L, 2L, 2L, 2L, 3L, 4L, 1L, 5L, 4L, 3L, 5L, 2L, 1L, 3L,
4L, 4L, 2L, 3L, 4L, 1L, 3L, 2L, 5L, 3L, 2L, 3L, 4L, 1L, 1L, 4L,
1L, 4L, 5L, 1L, 3L, 2L, 2L, 3L, 5L, 5L, 1L, 2L, 3L, 5L, 2L, 3L,
1L, 2L, 1L, 4L, 1L, 2L, 2L, 3L, 3L, 2L, 1L, 1L, 1L, 5L, 2L, 4L,
1L, 4L, 3L, 1L, 2L, 2L, 3L, 4L, 2L, 3L, 2L, 4L, 3L, 4L, 3L, 2L,
2L, 4L, 5L, 2L, 1L, 5L, 1L, 4L, 5L, 2L, 3L, 3L, 2L, 5L, 5L, 4L,
1L, 3L, 1L, 2L, 1L, 5L, 5L, 2L, 4L, 2L, 4L, 2L, 5L, 2L, 5L, 5L,
1L, 5L, 1L, 3L, 2L, 2L, 3L, 5L, 1L, 3L, 1L, 5L, 3L, 3L, 1L, 2L,
4L, 1L, 5L, 3L, 1L, 1L, 5L, 5L, 1L, 5L, 3L, 3L, 2L, 3L, 3L, 2L,
2L, 2L, 5L, 4L, 2L, 1L, 4L, 5L), movie = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L, 15L, 15L,
15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L,
17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L,
18L, 18L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L), .Label = c("Star Wars IV - A New Hope",
"Star Wars VI - Return of the Jedi", "Forrest Gump", "The Shawshank Redemption",
"The Silence of the Lambs", "Gladiator", "Toy Story", "Saving Private Ryan",
"Pulp Fiction", "Stand by Me", "Shakespeare in Love", "Total Recall",
"Independence Day", "Blade Runner", "Groundhog Day", "The Matrix",
"Schindler's List", "The Sixth Sense", "Raiders of the Lost Ark",
"Babe"), class = "factor")), .Names = c("Rating", "movie"), row.names = c(1L,
2L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 18L, 19L, 20L,
21L, 22L, 24L, 25L, 26L, 29L, 30L, 31L, 32L, 33L, 35L, 38L, 39L,
40L, 41L, 45L, 46L, 47L, 51L, 52L, 54L, 56L, 58L, 60L, 62L, 63L,
65L, 66L, 67L, 69L, 70L, 73L, 78L, 80L, 81L, 82L, 83L, 85L, 87L,
88L, 89L, 90L, 92L, 93L, 94L, 95L, 96L, 97L, 98L, 100L, 101L,
102L, 104L, 105L, 107L, 108L, 109L, 111L, 115L, 116L, 118L, 119L,
121L, 122L, 123L, 124L, 126L, 128L, 129L, 130L, 131L, 132L, 133L,
134L, 135L, 137L, 138L, 139L, 140L, 141L, 144L, 145L, 146L, 147L,
149L, 150L, 153L, 156L, 159L, 160L, 164L, 166L, 167L, 168L, 170L,
172L, 175L, 177L, 178L, 179L, 180L, 181L, 182L, 183L, 185L, 186L,
189L, 194L, 195L, 196L, 199L, 200L, 201L, 202L, 205L, 206L, 207L,
209L, 212L, 216L, 217L, 219L, 220L, 222L, 223L, 224L, 225L, 226L,
228L, 229L, 231L, 233L, 234L, 235L, 239L, 241L, 242L, 243L, 244L,
246L, 248L, 249L, 250L, 251L, 252L, 253L, 254L, 255L, 261L, 263L,
264L, 265L, 267L, 268L, 274L, 278L, 280L, 282L, 283L, 284L, 286L,
288L, 289L, 292L, 293L, 294L, 295L, 296L, 300L, 301L, 303L, 305L,
307L, 310L, 311L, 312L, 314L, 316L, 317L, 319L, 320L, 321L, 322L,
323L, 324L, 325L, 328L, 330L, 334L, 335L, 336L, 338L, 340L, 341L,
342L, 343L, 344L, 345L, 346L, 348L, 350L, 351L, 356L, 358L, 360L,
362L, 363L, 364L, 367L, 368L, 371L, 373L, 375L, 376L, 378L, 380L,
383L, 384L, 386L, 387L, 389L, 391L, 392L, 395L, 396L, 398L), class = "data.frame", na.action = structure(c(3L,
4L, 7L, 16L, 17L, 23L, 27L, 28L, 34L, 36L, 37L, 42L, 43L, 44L,
48L, 49L, 50L, 53L, 55L, 57L, 59L, 61L, 64L, 68L, 71L, 72L, 74L,
75L, 76L, 77L, 79L, 84L, 86L, 91L, 99L, 103L, 106L, 110L, 112L,
113L, 114L, 117L, 120L, 125L, 127L, 136L, 142L, 143L, 148L, 151L,
152L, 154L, 155L, 157L, 158L, 161L, 162L, 163L, 165L, 169L, 171L,
173L, 174L, 176L, 184L, 187L, 188L, 190L, 191L, 192L, 193L, 197L,
198L, 203L, 204L, 208L, 210L, 211L, 213L, 214L, 215L, 218L, 221L,
227L, 230L, 232L, 236L, 237L, 238L, 240L, 245L, 247L, 256L, 257L,
258L, 259L, 260L, 262L, 266L, 269L, 270L, 271L, 272L, 273L, 275L,
276L, 277L, 279L, 281L, 285L, 287L, 290L, 291L, 297L, 298L, 299L,
302L, 304L, 306L, 308L, 309L, 313L, 315L, 318L, 326L, 327L, 329L,
331L, 332L, 333L, 337L, 339L, 347L, 349L, 352L, 353L, 354L, 355L,
357L, 359L, 361L, 365L, 366L, 369L, 370L, 372L, 374L, 377L, 379L,
381L, 382L, 385L, 388L, 390L, 393L, 394L, 397L, 399L, 400L), .Names = c("3",
"4", "7", "16", "17", "23", "27", "28", "34", "36", "37", "42",
"43", "44", "48", "49", "50", "53", "55", "57", "59", "61", "64",
"68", "71", "72", "74", "75", "76", "77", "79", "84", "86", "91",
"99", "103", "106", "110", "112", "113", "114", "117", "120",
"125", "127", "136", "142", "143", "148", "151", "152", "154",
"155", "157", "158", "161", "162", "163", "165", "169", "171",
"173", "174", "176", "184", "187", "188", "190", "191", "192",
"193", "197", "198", "203", "204", "208", "210", "211", "213",
"214", "215", "218", "221", "227", "230", "232", "236", "237",
"238", "240", "245", "247", "256", "257", "258", "259", "260",
"262", "266", "269", "270", "271", "272", "273", "275", "276",
"277", "279", "281", "285", "287", "290", "291", "297", "298",
"299", "302", "304", "306", "308", "309", "313", "315", "318",
"326", "327", "329", "331", "332", "333", "337", "339", "347",
"349", "352", "353", "354", "355", "357", "359", "361", "365",
"366", "369", "370", "372", "374", "377", "379", "381", "382",
"385", "388", "390", "393", "394", "397", "399", "400"), class = "omit"))
I am using data.table instead of dplyr
library(data.table)
setDT(dfAux1) # convert dfAux1 to a data.table by reference (no copy)
# Per movie: share of ratings that are 4 stars or higher (`percent`) and the
# number of ratings (`tot`), listed from most- to least-rated. Computing the
# share as mean(Rating >= 4) in a single grouped step keeps movies that have
# no 4+ rating (percent = 0) and needs no de-duplication afterwards.
dfAux1[, .(percent = mean(Rating >= 4), tot = .N), by = movie][order(-tot)]
# movie percent tot
# 1: Toy Story 0.35294118 17
# 2: The Silence of the Lambs 0.43750000 16
# 3: Star Wars IV - A New Hope 0.53333333 15
# 4: Star Wars VI - Return of the Jedi 0.35714286 14
# 5: Independence Day 0.30769231 13
# 6: Gladiator 0.50000000 12
# 7: Total Recall 0.08333333 12
# 8: Groundhog Day 0.41666667 12
# 9: The Matrix 0.41666667 12
# 10: Schindler's List 0.33333333 12
# 11: The Sixth Sense 0.33333333 12
# 12: Saving Private Ryan 0.36363636 11
# 13: Pulp Fiction 0.36363636 11
# 14: Stand by Me 0.36363636 11
# 15: Shakespeare in Love 0.27272727 11
# 16: Raiders of the Lost Ark 0.27272727 11
# 17: Forrest Gump 0.30000000 10
# 18: The Shawshank Redemption 0.70000000 10
# 19: Babe 0.40000000 10
# 20: Blade Runner 0.44444444 9
A single line solution using data.table and data from OP could be as:
library(data.table)
# head() returns at most five rows; indexing with [1:5] would pad the result
# with NA rows whenever fewer than five movies are present.
head(setDT(dfAux1)[, .(pct = mean(Rating >= 4)), by = movie][order(-pct)], 5)
movie pct
1: The Shawshank Redemption 0.7000000
2: Star Wars IV - A New Hope 0.5333333
3: Gladiator 0.5000000
4: Blade Runner 0.4444444
5: The Silence of the Lambs 0.4375000
Overview
I used the dplyr package to group your data by the movie column and perform calculations based on the rating column.
In summarise(), I created three new columns:
Total_Review: counts the total number of reviews per movie.
FourPlus_Rating: counts the subset of reviews with a Rating value of 4 or higher.
Per_FourPlus_Rating: divides FourPlus_Rating by Total_Review.
I then arranged the data in descending order based on Per_FourPlus_Rating. Finally, I called head() to specify that I only want the tibble to return the first 5 rows.
Reproducible Example
# Install dplyr only when it is missing, so that re-running the script
# does not reinstall the package every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
# load necessary package
library(dplyr)
# view first six rows
head(df)
# Rating movie
# 1 1 Star Wars IV - A New Hope
# 2 5 Star Wars IV - A New Hope
# 5 4 Star Wars IV - A New Hope
# 6 2 Star Wars IV - A New Hope
# 8 4 Star Wars IV - A New Hope
# 9 5 Star Wars IV - A New Hope
# For each movie: count the reviews, count the reviews rated 4 or higher,
# and divide the second count by the first. Then sort by that share in
# descending order and keep the first five rows.
df %>%
  group_by(movie) %>%
  summarise(
    Total_Review = n(),
    # na.rm = TRUE so that missing ratings are not counted as 4+
    FourPlus_Rating = sum(Rating >= 4, na.rm = TRUE),
    # summarise() can reuse the two columns created just above
    Per_FourPlus_Rating = FourPlus_Rating / Total_Review
  ) %>%
  arrange(desc(Per_FourPlus_Rating)) %>%
  head(n = 5)
# A tibble: 5 x 4
# movie Total_Review FourPlus_Rating Per_FourPlus_Rati…
# <fct> <int> <int> <dbl>
# 1 The Shawshank Rede… 10 7 0.700
# 2 Star Wars IV - A N… 15 8 0.533
# 3 Gladiator 12 6 0.500
# 4 Blade Runner 9 4 0.444
# 5 The Silence of the… 16 7 0.438
# end of script #
this is a dplyr solution:
# Number of ratings of 4 or more for each movie.
dfAuxhigh <- dfAux1 %>%
  filter(Rating >= 4) %>%
  group_by(movie) %>%
  summarise(percentHigh = n())

# Total number of ratings for each movie.
dfAux <- dfAux1 %>%
  group_by(movie) %>%
  summarise(percentAll = n())

# Join the two counts by movie and compute the share of high ratings.
result <- merge(dfAuxhigh, dfAux, by = "movie") %>%
  mutate(percentage = percentHigh / percentAll)

# Keep the five rows with the largest share, showing only the movie and
# percentage columns.
result <- result[
  order(result$percentage, decreasing = TRUE)[1:5],
  c("movie", "percentage")
]
library(tidyverse)

df %>%
  group_by(movie, Rating) %>%
  summarise(n = n()) %>%           # number of reviews per movie and rating
  mutate(freq = n / sum(n)) %>%    # share of each rating within a movie
  filter(Rating >= 4) %>%          # keep the desired ratings (4 or above)
  summarise(freq = sum(freq)) %>%  # total share of 4+ ratings per movie
  # Name the ranking column explicitly: without wt, top_n() falls back to
  # the last column of the table and prints a "Selecting by" message.
  top_n(5, wt = freq) %>%
  arrange(desc(freq)) %>%
  mutate(freq = paste0(round(freq * 100, 2), "%"))
#> movie freq
#> 1 The Shawshank Redemption 70%
#> 2 Star Wars IV - A New Hope 53.33%
#> 3 Gladiator 50%
#> 4 Blade Runner 44.44%
#> 5 The Silence of the Lambs 43.75%
This is very basic, but I have been stuck on this for a while now.
I want to remove the observation -Steve Sax from the dataset hitters.txt:
> dput(hitters[280:290,])
structure(list(AtBat = c(439L, 453L, 528L, 633L, 16L, 562L, 281L,
593L, 687L, 368L, 263L), Hits = c(96L, 103L, 122L, 210L, 2L,
169L, 76L, 152L, 213L, 103L, 70L), HmRun = c(0L, 8L, 1L, 6L,
0L, 17L, 3L, 23L, 10L, 3L, 1L), Runs = c(44L, 53L, 67L, 91L,
1L, 88L, 42L, 69L, 91L, 48L, 26L), RBI = c(36L, 33L, 45L, 56L,
0L, 73L, 25L, 75L, 65L, 28L, 23L), Walks = c(65L, 52L, 51L, 59L,
0L, 53L, 20L, 53L, 27L, 54L, 30L), Years = c(4L, 2L, 4L, 6L,
2L, 8L, 8L, 6L, 4L, 8L, 4L), CAtBat = c(711L, 507L, 1716L, 3070L,
28L, 3181L, 2658L, 2765L, 1518L, 1897L, 888L), CHits = c(148L,
123L, 403L, 872L, 4L, 841L, 657L, 686L, 448L, 493L, 220L), CHmRun = c(1L,
8L, 12L, 19L, 0L, 61L, 48L, 133L, 15L, 9L, 9L), CRuns = c(68L,
63L, 211L, 420L, 1L, 450L, 324L, 369L, 196L, 207L, 83L), CRBI = c(56L,
39L, 146L, 230L, 0L, 342L, 300L, 384L, 137L, 162L, 82L), CWalks = c(99L,
58L, 155L, 274L, 0L, 373L, 179L, 321L, 89L, 198L, 86L), League = structure(c(2L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L), .Label = c("A", "N"), class = "factor"),
Division = structure(c(1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L,
2L, 1L), .Label = c("E", "W"), class = "factor"), PutOuts = c(229L,
289L, 209L, 367L, 247L, 351L, 106L, 315L, 294L, 209L, 81L
), Assists = c(406L, 407L, 372L, 432L, 4L, 442L, 144L, 10L,
445L, 246L, 147L), Errors = c(22L, 6L, 17L, 16L, 8L, 17L,
7L, 6L, 13L, 3L, 4L), Salary = c(150, 105, 350, 90, NA, 530,
341.667, 940, 350, 326.667, 250), NewLeague = structure(c(2L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L), .Label = c("A",
"N"), class = "factor")), .Names = c("AtBat", "Hits", "HmRun",
"Runs", "RBI", "Walks", "Years", "CAtBat", "CHits", "CHmRun",
"CRuns", "CRBI", "CWalks", "League", "Division", "PutOuts", "Assists",
"Errors", "Salary", "NewLeague"), row.names = c("-Steve Jeltz",
"-Steve Lombardozzi", "-Spike Owen", "-Steve Sax", "-Tony Armas",
"-Tony Bernazard", "-Tom Brookens", "-Tom Brunansky", "-Tony Fernandez",
"-Tim Flannery", "-Tom Foley"), class = "data.frame")
If I knew the name of the first column I would have used:
hitters <- hitters[!hitters$Colname == "-Steve Sax",]
or
hitters <- hitters[hitters$AtBat != "-Steve Sax", ]
But I don't know the name of the first column.
I have tried: read.table("hitters.txt", head = F)
and
read.table("hitters.txt", head = F)
My questions are:
How can I remove the observation?
Why didn't head = T work?
The first "column" represents the row names (this is not an actual column in the data set but appears as such in the output). You can access row names with the function rownames:
# Keep every row whose row name is not "-Steve Sax".
hitters[rownames(hitters) != "-Steve Sax", ]
will remove the observation from the data set.
Output:
AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
-Steve Jeltz 439 96 0 44 36 65 4 711 148 1 68
-Steve Lombardozzi 453 103 8 53 33 52 2 507 123 8 63
-Spike Owen 528 122 1 67 45 51 4 1716 403 12 211
-Tony Armas 16 2 0 1 0 0 2 28 4 0 1
-Tony Bernazard 562 169 17 88 73 53 8 3181 841 61 450
-Tom Brookens 281 76 3 42 25 20 8 2658 657 48 324
-Tom Brunansky 593 152 23 69 75 53 6 2765 686 133 369
-Tony Fernandez 687 213 10 91 65 27 4 1518 448 15 196
-Tim Flannery 368 103 3 48 28 54 8 1897 493 9 207
-Tom Foley 263 70 1 26 23 30 4 888 220 9 83
I have this dataset. I want to make side-by-side boxplots of only those movies whose indexes appear 67 times in the "movie" column. The following code tells me the indexes that appear 67 times in the "movie" column:
names(which(table(votes$movie) == 67))
But how can I make side-by-side boxplots for the "rating" each of these indexes? And how can I also add the averages as a single point on each of those boxplots?
I have tried:
boxplot(votes$rating[which(table(votes$movie) == 67)])
But this is clearly wrong, as it shows me only one boxplot
MRE:
# set.seed(1)
# votes2 <- votes[sample(1:nrow(votes), 100, TRUE), ]
votes2 <-
structure(list(user = c(869L, 620L, 42L, 341L, 930L, 267L, 708L,934L, 148L, 385L, 251L, 181L, 313L, 437L, 747L, 260L, 109L, 201L,229L, 366L, 921L, 829L, 934L, 868L, 321L, 226L, 527L, 726L, 26L,457L, 117L, 325L, 327L, 60L, 804L, 158L, 593L, 200L, 880L, 482L,868L, 339L, 328L, 347L, 100L, 896L, 846L, 676L, 357L, 496L, 541L,807L, 257L, 924L, 894L, 478L, 601L, 13L, 311L, 230L, 435L, 654L,742L, 180L, 887L, 201L, 147L, 326L, 749L, 465L, 727L, 200L, 216L,267L, 345L, 445L, 268L, 26L, 366L, 82L, 763L, 436L, 324L, 707L,802L, 280L, 682L, 343L, 826L, 325L, 508L, 618L, 405L, 655L, 645L,378L, 296L, 438L, 450L, 151L), movie = c(181L, 240L, 410L, 948L,143L, 926L, 1054L, 502L, 474L, 47L, 147L, 125L, 527L, 249L, 659L,319L, 576L, 1426L, 245L, 672L, 1028L, 151L, 492L, 90L, 182L,250L, 7L, 248L, 841L, 222L, 307L, 434L, 318L, 132L, 746L, 510L,692L, 79L, 585L, 269L, 739L, 485L, 679L, 386L, 347L, 686L, 12L,303L, 597L, 532L, 304L, 820L, 285L, 173L, 52L, 71L, 208L, 333L,504L, 266L, 961L, 195L, 294L, 216L, 491L, 179L, 304L, 655L, 62L,855L, 222L, 756L, 226L, 217L, 303L, 902L, 825L, 255L, 671L, 1128L,283L, 568L, 259L, 212L, 646L, 144L, 566L, 88L, 174L, 99L, 172L,44L, 482L, 863L, 674L, 696L, 292L, 269L, 722L, 443L), rating = c(3L,5L, 3L, 3L, 2L, 2L, 3L, 4L, 5L, 4L, 3L, 3L, 4L, 5L, 4L, 2L, 3L,2L, 3L, 5L, 4L, 4L, 4L, 3L, 3L, 4L, 5L, 2L, 2L, 5L, 5L, 5L, 5L,4L, 4L, 3L, 3L, 5L, 1L, 4L, 2L, 5L, 2L, 1L, 4L, 3L, 5L, 4L, 4L,5L, 4L, 3L, 5L, 5L, 4L, 3L, 4L, 3L, 4L, 4L, 1L, 4L, 3L, 5L, 2L,5L, 5L, 5L, 3L, 4L, 3L, 3L, 3L, 4L, 4L, 4L, 3L, 3L, 5L, 1L, 4L,5L, 5L, 4L, 4L, 2L, 3L, 4L, 5L, 5L, 5L, 4L, 3L, 3L, 3L, 3L, 5L,4L, 5L, 5L),
timestamp = structure(c(884490825, 889987954, 881110483,890758169, 879535462, 878970785, 877326158, 891194539, 877019882,879441982, 886272319, 878962816, 891013525, 880142027, 888639175,890618198, 880580663, 884114015, 891632385, 888858078, 879380142,891990672, 891192087, 877109874, 879439679, 883890491, 879456162,889832422, 891380200, 882392853, 880124339, 891478376, 887820828,883325944, 879444890, 880134296, 886193724, 884128499, 880175050,887643096, 877111542, 891032413, 885049460, 881654846, 891375212,887159146, 883947777, 892685403, 878952080, 876072633, 883864207,892532068, 882049950, 885458060, 882404507, 889388790, 876350017,881514810, 884364873, 880484286, 884133635, 887864350, 881005590,877128388, 881379566, 884114471, 885593942, 879875432, 878849052,883531444, 883709350, 876042493, 880244803, 878973760, 884900448,891200870, 875742893, 891377609, 888857990, 884714361, 878915600,887769416, 880575107, 886286792, 875986155, 891700514, 888519260,876405130, 885690481, 891479244, 883767157, 891308791, 885544739,887473995, 892054402, 880045044, 884196057, 879867960, 882471524,879524947), class = c("POSIXct","POSIXt"), tzone = "")),
.Names = c("user","movie", "rating", "timestamp"), row.names = c(26551L, 37213L,57286L, 90821L, 20169L, 89839L, 94468L, 66080L, 62912L, 6179L,20598L, 17656L, 68703L, 38411L, 76985L, 49770L, 71762L, 99191L,38004L, 77745L, 93471L, 21215L, 65168L, 12556L, 26723L, 38612L,1340L, 38239L, 86970L, 34035L, 48209L, 59957L, 49355L, 18622L,82738L, 66847L, 79424L, 10795L, 72372L, 41128L, 82095L, 64707L,78294L, 55304L, 52972L, 78936L, 2334L, 47724L, 73232L, 69274L,47762L, 86121L, 43810L, 24480L, 7068L, 9947L, 31628L, 51864L,66201L, 40684L, 91288L, 29361L, 45907L, 33240L, 65088L, 25802L,47855L, 76632L, 8425L, 87533L, 33908L, 83945L, 34669L, 33378L,47636L, 89220L, 86434L, 38999L, 77733L, 96062L, 43466L, 71252L,40000L, 32536L, 75709L, 20270L, 71113L, 12170L, 24549L, 14331L,23963L, 5894L, 64229L, 87627L, 77892L, 79731L, 45528L, 41009L,
81088L, 60494L), class = "data.frame")
names(which(table(votes2$movie) == 2))
# [1] "222" "269" "303" "304"
boxplot(votes2$rating[which(table(votes2$movie) == 2)])
Perhaps the following, as I understand the request to be for side-by-side boxplots of the ratings of those movies that have exactly 67 votes:
# Side-by-side boxplots of rating, one box per movie, restricted to the
# movies that occur exactly 67 times in votes$movie.
# which() is required: names() of the bare logical table would return every
# movie, not only those whose count equals 67.
boxplot(rating ~ movie, data = votes,
        subset = movie %in% names(which(table(votes$movie) == 67)))
I switched the order of the terms in my first guess at the correct formula, but testing showed this was more successful on your example:
# One box of rating per movie, for the movies that occur exactly twice in
# votes2$movie ("222", "269", "303" and "304" in this sample).
# which() keeps only the table entries whose count equals 2; without it,
# names() would return every movie and nothing would be filtered.
boxplot(rating ~ movie, data = votes2,
        subset = movie %in% names(which(table(votes2$movie) == 2)))
You should probably do a search on rhelp and SO for plotting a point or text for the mean of categories on boxplots. Pretty sure this has been asked before. If unsuccessful, then report on the search terms used.