Having issues running a group-wise Shapiro-Wilk test for RMANOVA - r

I'm currently using the "weightloss" dataset from the datarium package to start running an RMANOVA. Here is the dput:
dput(head(weightloss))
structure(list(id = structure(1:6, .Label = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10", "11", "12"), class = "factor"),
diet = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("no",
"yes"), class = "factor"), exercises = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = c("no", "yes"), class = "factor"),
t1 = c(10.43, 11.59, 11.35, 11.12, 9.5, 9.5), t2 = c(13.21,
10.66, 11.12, 9.5, 9.73, 12.74), t3 = c(11.59, 13.21, 11.35,
11.12, 12.28, 10.43)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
So this is the script I have come up with so far:
# Create Data Frame for Dataset:
weight <- weightloss
weight
# Pivot Longer Data to Create Factors and Scores:
weight <- weight %>%
pivot_longer(names_to = 'trial', # creates factor (x)
values_to = 'value', # creates value (y)
cols = t1:t3) # finds which cols to factor
# Plot Means in Boxplot:
ggplot(weight,
aes(x=trial,y=value))+
geom_boxplot()+
labs(title = "Trial Means") # As can be predicted, inc w/time
I get this pretty normal looking boxplot:
Now it's time to find outliers and test for normality.
# Identify Outliers (Should be None Given Boxplot):
outlier <- weight %>%
group_by(trial) %>%
identify_outliers(value)
outlier_frame <- data.frame(outlier)
outlier_frame # none found :)
# Normality (Shapiro-Wilk and QQPlot):
model <- lm(value~trial,
data = weight) # creates model
shapiro_test(residuals(model)) # measures Shapiro
ggqqplot(residuals(model))+
labs(title = "QQ Plot of Residuals") # creates QQ
This again gives me a pretty normal QQplot:
I then wrapped the data by trial:
ggqqplot(weight, "value", ggtheme = theme_bw())+
facet_wrap(~trial)+
labs(title = "QQPlot of Each Trial") #looks normal
And it comes out right from what I can tell:
However, when I try to do a Shapiro Wilk test by group, I keep having issues with this code:
shapiro_group <- weight %>%
group_by(trial) %>%
shapiro_test(value)
It gives me this error:
Error: Problem with mutate() column data. i data = map(.data$data, .f, ...). x Must group by variables found in .data.
Column variable is not found.
I also tried this:
shapiro_test(weight, trial$value)
And get this error instead:
Error: Can't subset columns that don't exist. x Column trial$value
doesn't exist.
If anybody has some insight as to why, I would greatly appreciate it!

The reason you were getting an error from shapiro_test is that its implementation contains this line:
shapiro_test
function (data, ..., vars = NULL)
{
....
....
data <- data %>% gather(key = "variable", value = "value") %>%
filter(!is.na(value))
....
....
}
Here the data is reshaped to long format with gather, which creates columns named variable and value. Since your data already has a column named value, the names clash and the call fails.
If you change the name of value column to anything else it works.
library(dplyr)
library(rstatix)
weight %>%
rename(value1 = value) %>%
group_by(trial) %>%
shapiro_test(value1)
# trial variable statistic p
# <chr> <chr> <dbl> <dbl>
#1 t1 value1 0.869 0.222
#2 t2 value1 0.910 0.440
#3 t3 value1 0.971 0.897

Related

Looping pipe operator code through multiple Dataframe in R

does anyone know how I can loop pipe operator code through multiple dataframe?
I have quite a few data frames, one per year (df_1990, df_1991 ... df_2020). However, not all years are included (i.e. df_1993, df_2012 and 3 more years are not available). To account for this, I manually created a list to store all the data frames for the looping (do enlighten me if there's a faster way to do this).
df_list = list(df_1990, df_1991, ..., df_2020)
for (i in df_list) {
...
}
The data frames are pretty simple, with just 2 columns: Item (character field) and Cost (numeric field).
Item
Cost
Book_A
3.00
Book_B
5.00
...
...
a sample code for the dataframe
df = structure(list(Item = structure(c(1L, 1L, 1L, 2L, 2L, 3L, 2L,
3L, 1L, 2L, 1L, 2L, 1L, 3L, 1L, 2L, 2L, 1L, 3L, 1L), .Label = c("Book A",
"Book B", "Book C"), class = "factor"), Cost = c(5, 3.5, 12,
6, 8, 3, 6, 3.5, 3.8, 13, 5.1, 7, 11.5, 3.8, 5.5, 6.5, 13.5,
5.5, 3.5, 1.2)), class = "data.frame", row.names = c(NA, -20L
))
Does anyone know how I can add in the following code into the ... portion of the for loop code above? Thank you!
df %>%
group_by(Item) %>%
summarise(outlier = mean(Cost),
offset = outlier * 0.6,
higher_value = outlier + offset,
lower_value = outlier - offset) %>%
left_join(df, by = 'Item') %>%
transmute(Item, Cost, Outlier = ifelse(Cost < lower_value | Cost > higher_value, 'Y', 'N'))
The code basically detects outliers (e.g. if the cost is more than 60% higher or lower than the average for that particular item) and outputs a column of "Y" and "N" for each row respectively. (Credit for the code goes to Ronak Shah.)
Ideally the new column created should appear in the list created to allow exporting to excel format
Thank you!
Personally I would move the data wrangling code in a function and would then use lapply to loop over your list of data frames.
library(dplyr)
df_list <- list(df, df, df)
#' Flag rows whose Cost deviates too far from the mean cost of their Item
#'
#' @param x A data frame with the columns `Item` and `Cost`.
#' @param threshold Allowed relative deviation from the per-item mean cost.
#'   Defaults to 0.6, i.e. costs more than 60 % above or below the mean
#'   are flagged.
#' @return A tibble with the columns `Item`, `Cost` and `Outlier`
#'   ("Y" or "N").
prep_data <- function(x, threshold = 0.6) {
  x %>%
    group_by(Item) %>%
    summarise(
      outlier = mean(Cost),
      offset = outlier * threshold,
      higher_value = outlier + offset,
      lower_value = outlier - offset
    ) %>%
    # Bring the per-item bounds back onto every original row.
    left_join(x, by = "Item") %>%
    transmute(Item, Cost, Outlier = ifelse(Cost < lower_value | Cost > higher_value, "Y", "N"))
}
df_prep <- lapply(df_list, prep_data)
lapply(df_prep, head, 2)
#> [[1]]
#> # A tibble: 2 × 3
#> Item Cost Outlier
#> <fct> <dbl> <chr>
#> 1 Book A 5 N
#> 2 Book A 3.5 N
#>
#> [[2]]
#> # A tibble: 2 × 3
#> Item Cost Outlier
#> <fct> <dbl> <chr>
#> 1 Book A 5 N
#> 2 Book A 3.5 N
#>
#> [[3]]
#> # A tibble: 2 × 3
#> Item Cost Outlier
#> <fct> <dbl> <chr>
#> 1 Book A 5 N
#> 2 Book A 3.5 N
If you want to do it via a for loop then you could achieve the same result like so:
# Preallocate the result list instead of growing it on every iteration.
df_prep <- vector("list", length(df_list))
for (i in seq_along(df_list)) {
  df_prep[[i]] <- prep_data(df_list[[i]])
}
# lapply() keeps the names of its input; copy them so both approaches agree.
names(df_prep) <- names(df_list)
Why don't you put all your data into one dataframe:
df_list = list(df_1990 = df_1990, df_1991 = df_1991, ..., df_2020 = df_2020)
df2 = dplyr::bind_rows(df_list, .id = 'Year')
then you only have to add the variable Year into the group_by statement:
group_by(Year, Item)
If you need to, you can always convert it back to a list of dataframes:
df2 %>%
tidyr::nest(data = Item:Cost) %>%
pull(data, name = Year)
Btw, you can also improve the code for the outlier detection, by omitting the join:
df2 %>%
  group_by(Year, Item) %>%
  # mutate() keeps every row and repeats the group mean on each of them,
  # so no join back to the original data is needed.
  mutate(outlier = mean(Cost),
         offset = outlier * 0.6,
         higher_value = outlier + offset,
         lower_value = outlier - offset) %>%
  transmute(Item, Cost, Outlier = if_else(Cost < lower_value | Cost > higher_value, 'Y', 'N')) %>%
  # Drop the grouping so later verbs do not silently operate per Year/Item.
  ungroup()
Using mutate instead of summarise keeps all rows and copies the result of mean(Cost) to every row of the group.

The first two columns defined as "rownames"

I want to define the first two columns of a data frame as rownames. Actually I want to do some calculations and the data frame has to be numeric for that.
data.frame <- data_frame(id=c("A1","B2"),name=c("julia","daniel"),BMI=c("20","49"))
The values for BMI are numeric (checked with is.numeric), but the data frame as a whole is not. How can I define the first two columns (id and name) as rownames?
Thank you in advance for any suggestions
You can combine id and name column and then assign rownames
data.frame %>%
tidyr::unite(rowname, id, name) %>%
tibble::column_to_rownames()
# BMI
#A1_julia 20
#B2_daniel 49
In base R, you can do the same in steps as
data.frame <- as.data.frame(data.frame)
rownames(data.frame) <- paste(data.frame$id, data.frame$name, sep = "_")
data.frame[c('id', 'name')] <- NULL
Not sure if the code and result below is the thing you are after:
# Convert via character first: as.numeric() applied directly to a factor
# returns the internal level codes (1, 2), not the labels ("20", "49").
dfout <- `rownames<-`(
  data.frame(BMI = as.numeric(as.character(df$BMI))),
  paste(df$id, df$name)
)
such that
> dfout
BMI
A1 julia 20
B2 daniel 49
DATA
df <- structure(list(id = structure(1:2, .Label = c("A1", "B2"), class = "factor"),
name = structure(2:1, .Label = c("daniel", "julia"), class = "factor"),
BMI = structure(1:2, .Label = c("20", "49"), class = "factor")), class = "data.frame", row.names = c(NA,
-2L))

Get single column of values comparing multiple columns

I have just started my journey with R. I want to test values across multiple columns for the same condition and return 5 if any of the values is "hello" within a row:
result = ifelse((myData[1] == "hello") | (myData[2] == "hello") | (myData[3] == "hello"), 5, 0)
This works fine, but code seems to be redundant. When I do:
resultSec = ifelse(myData[1:3] == "hello", 5, 0)
Then all 3 columns are checked against the condition, but the result I get is not a single column, but 3 columns. So I would then have to perform an additional comparison across all columns, which makes for more lines of code than the first, redundant method.
How can I get in this case a one column of values in efficient way ?
You can use the function apply() to iterate over a data.frame or matrix, by either columns or rows. The margin argument determines which one you use.
Here we want to check the rows, so we use margin = 1:
dat <- data.frame(col1 = c("happy", "sad", "mad"),
                  col2 = c("tired", "sleepy", "happy"),
                  col3 = c("relaxed", "focused", "fine"))
# Test every column for "happy" and OR the per-column results together.
# This stays vectorised, avoids apply() coercing the data frame to a matrix,
# and %in% never yields NA for missing cells.
has_happy <- Reduce(`|`, lapply(dat, function(column) column %in% "happy"))
dat$res <- ifelse(has_happy, 5, 0)
dat
col1 col2 col3 res
1 happy tired relaxed 5
2 sad sleepy focused 0
3 mad happy fine 5
We can use rowSums here
# Any match in a row yields 5. Multiplying the raw match count by 5 would
# give 10 or 15 when several columns match; na.rm = TRUE treats missing
# cells as non-matches instead of turning the whole row into NA.
df1$res <- (rowSums(df1 == "happy", na.rm = TRUE) > 0) * 5
df1$res
#[1] 5 0 5
data
df1 <- structure(list(col1 = structure(c(1L, 3L, 2L), .Label = c("happy",
"mad", "sad"), class = "factor"), col2 = structure(c(3L, 2L,
1L), .Label = c("happy", "sleepy", "tired"), class = "factor"),
col3 = structure(c(3L, 2L, 1L), .Label = c("fine", "focused",
"relaxed"), class = "factor")), .Names = c("col1", "col2",
"col3"), row.names = c(NA, -3L), class = "data.frame")

Order column names in ascending order within dplyr chain

I have this data.frame:
df <- structure(list(att_number = structure(1:3, .Label = c("0", "1",
"2"), class = "factor"), `1` = structure(c(2L, 3L, 1L), .Label = c("1026891",
"412419", "424869"), class = "factor"), `10` = structure(c(2L,
1L, 3L), .Label = c("235067", "546686", "92324"), class = "factor"),
`2` = structure(c(3L, 1L, 2L), .Label = c("12729", "7569",
"9149"), class = "factor")), .Names = c("att_number", "1",
"10", "2"), row.names = c(NA, -3L), class = "data.frame")
It looks like this having numbers as the column names.
att_number 1 10 2
0 412419 546686 9149
1 424869 235067 12729
2 1026891 92324 7569
Within a dplyr chain, I would like to order the columns in ascending order, like this:
att_number 1 2 10
0 412419 9149 546686
1 424869 12729 235067
2 1026891 7569 92324
I've tried using select_, but it doesn't want to work according to plan. Any idea on how I can do this? Here's my feeble attempt:
names_order <- names(df)[-1] %>%
as.numeric %>%
.[order(.)] %>%
as.character %>%
c('att_number', .)
df %>%
select_(.dots = names_order)
Error: Position must be between 0 and n
Update:
For newer versions of dplyr (>= 0.7.0):
library(tidyverse)
#' Reorder columns: non-numeric names alphabetically, then all-digit names
#' in ascending numeric order
#'
#' @param data A data frame.
#' @return `data` with its columns reordered.
sort_names <- function(data) {
  name <- names(data)
  is_num <- !grepl("[^0-9]", name)
  chars <- sort(name[!is_num])
  # Order the original strings by their numeric value instead of converting
  # them to numbers and back: the round trip turns "100000" into "1e+05"
  # and "01" into "1", which no longer match any column.
  nums <- name[is_num]
  nums <- nums[order(as.numeric(nums))]
  select(data, !!!c(chars, nums))
}
sort_names(df)
Original:
You need back ticks around the numeric column names to stop select from trying to interpret them as column positions:
library(tidyverse)
#' Reorder columns: non-numeric names alphabetically, then all-digit names
#' in ascending numeric order (for dplyr versions that still use select_())
#'
#' @param data A data frame.
#' @return `data` with its columns reordered.
sort_names <- function(data) {
  name <- names(data)
  is_num <- !grepl("[^0-9]", name)
  chars <- sort(name[!is_num])
  # Order the original strings by their numeric value instead of converting
  # them to numbers and back: the round trip turns "100000" into "1e+05"
  # and "01" into "1", which no longer match any column.
  nums <- name[is_num]
  nums <- nums[order(as.numeric(nums))]
  # Backticks stop select_() from reading numeric names as column positions.
  select_(data, .dots = c(chars, sprintf("`%s`", nums)))
}
sort_names(df)

How to save the column names and their corresponding type in R into excel?

I have an R data set with more than 200 columns. I need to get the class of each column and export that to Excel, with the column name and its corresponding class as two columns.
1. Using lapply/sapply with stack/melt
You could do this using lapply/sapply to get the class of each column and then using stack from base R or melt from reshape2 to get the 2 column data.frame.
# Collapse the class vector of each column into a single string so that
# columns with several classes (e.g. POSIXct or ordered factors) still give
# exactly one row per column.
res <- stack(vapply(df, function(x) paste(class(x), collapse = "/"), character(1)))
#or
library(reshape2)
res1<- melt(lapply(df, class))
Then use write.csv or using any of the specialized libraries for writing to excel data i.e. XLConnect, WriteXLS etc.
write.csv(res, file="file1.csv", row.names=FALSE, quote=FALSE)
.csv files can be opened in excel
2. From the output of str
Or you could use capture.output and regex to get the required info from the str and convert it to data.frame using read.table
# Capture what str() prints for the data frame, one element per output line.
v1 <- capture.output(str(df))
# Keep only the lines that contain a "$", i.e. the per-column lines.
v2 <- grep("\\$", v1, value=TRUE)
# Reduce each kept line to "<name> <type>" (the text between "$" and ":",
# then the first alphabetic word after the colon) and parse the pairs into a
# two-column data frame.
res2 <- read.table(text=gsub(" +\\$ +(.*)\\: +([A-Za-z]+) +.*", "\\1 \\2", v2),
sep="",header=FALSE,stringsAsFactors=FALSE)
head(res2,2)
# V1 V2
#1 t02.clase Factor
#2 Std_A_CLI_monto_sucursal_1 chr
data
df <-structure(list(t02.clase = structure(c(1L, 1L, 1L), .Label = "AK",
class = "factor"),Std_A_CLI_monto_sucursal_1 = c("0", "0", "0"),
Std_A_CLI_monto_sucursal_2 = c(0, 0.01303586, 0), Std_A_CLI_monto_sucursal_3 =
c(0.051311597, 0.003442244, 0.017347593), Std_A_CLI_monto_sucursal_4 = c(0L,
0L, 0L), Std_A_CLI_promociones = c(0.4736842, 0.5, 0), Std_A_CLI_dias_cliente =
c(0.57061341, 0.55492154, 0.05991441), Std_A_CLI_sucursales = c(0.05555556,
0.05555556, 0.05555556)), .Names = c("t02.clase", "Std_A_CLI_monto_sucursal_1",
"Std_A_CLI_monto_sucursal_2", "Std_A_CLI_monto_sucursal_3",
"Std_A_CLI_monto_sucursal_4", "Std_A_CLI_promociones", "Std_A_CLI_dias_cliente",
"Std_A_CLI_sucursales"), row.names = c("1", "2", "3"), class = "data.frame")

Resources