R Pivot Longer With Multiple Columns

R Pivot Longer With Multiple Columns - r

HAVE = data.frame( COURSE =c( 1, 1, 1, 2, 2, 2, 3, 3, 3 ),
STUDENT =c( 'A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C' ),
FISH =c( 4, 8, 9, 1, 7, 1, 10, 10, 10 ),
CAT =c( 9, 8, 10, 7, 1, 2, 8, 0, 2 ),
FOX =c( 7, NA, 9, 0, NA, 10, 5, NA, 10 ),
BUNNIE =c( 6, NA, 0, 5, NA, 6, 4, NA, 1 ),
RABBIT =c( 2, NA, 0, 6, NA, 8, 3, NA, 0 ))
WANT = data.frame( COURSE =c( 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3 ),
TEST =c( 'FISH', 'CAT', 'FOX', 'BUNNIE', 'RABBIT', 'FISH', 'CAT', 'FOX', 'BUNNIE', 'RABBIT', 'FISH', 'CAT', 'FOX', 'BUNNIE', 'RABBIT' ),
A =c( 4, 9, 7, 6, 2, 1, 7, 0, 5, 6, 10, 8, 5, 4, 3 ),
B =c( 8, 8, NA, NA, NA, 7, 1, NA, NA, NA, 10, 0, NA, NA, NA ),
C =c( 9, 10, 9, 0, 0, 1, 2, 10, 6, 8, 10, 2, 10, 1, 0 ))
I try:
WANT = HAVE %>% pivot_longer(FISH:RABBIT, names_to = "TEST", values_to = A:C) with no success

Basically you want to gather the animals names into a single column named "TEST", and then expand the student names in several columns. So you need two steps:
pivot_longer() where you gather the animals names
pivot_wider() where you expand the student names
library(tidyr)
# Example input: one row per (COURSE, STUDENT) pair and one score column per
# test. COURSE is stored as double, STUDENT as character.
HAVE <- data.frame(
  COURSE  = rep(c(1, 2, 3), each = 3),
  STUDENT = rep(c("A", "B", "C"), times = 3),
  FISH    = c(4, 8, 9, 1, 7, 1, 10, 10, 10),
  CAT     = c(9, 8, 10, 7, 1, 2, 8, 0, 2),
  FOX     = c(7, NA, 9, 0, NA, 10, 5, NA, 10),
  BUNNIE  = c(6, NA, 0, 5, NA, 6, 4, NA, 1),
  RABBIT  = c(2, NA, 0, 6, NA, 8, 3, NA, 0)
)
# Reshape in two steps: first stack the five test columns into (TEST, value)
# pairs, then spread the students back out so each becomes its own column.
out <- HAVE |>
  pivot_longer(FISH:RABBIT, names_to = "TEST", values_to = "value") |>
  pivot_wider(names_from = STUDENT, values_from = value)
out
#> # A tibble: 15 × 5
#> COURSE TEST A B C
#> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 1 FISH 4 8 9
#> 2 1 CAT 9 8 10
#> 3 1 FOX 7 NA 9
#> 4 1 BUNNIE 6 NA 0
#> 5 1 RABBIT 2 NA 0
#> 6 2 FISH 1 7 1
#> 7 2 CAT 7 1 2
#> 8 2 FOX 0 NA 10
#> 9 2 BUNNIE 5 NA 6
#> 10 2 RABBIT 6 NA 8
#> 11 3 FISH 10 10 10
#> 12 3 CAT 8 0 2
#> 13 3 FOX 5 NA 10
#> 14 3 BUNNIE 4 NA 1
#> 15 3 RABBIT 3 NA 0
Check that the result is what is expected:
# Expected result, written out by hand: one row per (COURSE, TEST) pair and one
# column per student. COURSE is built from doubles (not 1:3) so that the
# identical() comparison against `out` still holds.
WANT <- data.frame(
  COURSE = rep(c(1, 2, 3), each = 5),
  TEST   = rep(c("FISH", "CAT", "FOX", "BUNNIE", "RABBIT"), times = 3),
  A      = c(4, 9, 7, 6, 2, 1, 7, 0, 5, 6, 10, 8, 5, 4, 3),
  B      = c(8, 8, NA, NA, NA, 7, 1, NA, NA, NA, 10, 0, NA, NA, NA),
  C      = c(9, 10, 9, 0, 0, 1, 2, 10, 6, 8, 10, 2, 10, 1, 0)
)
identical(out, as_tibble(WANT))
#> [1] TRUE
Created on 2022-10-05 with reprex v2.0.2

Related

variable based on other variables in R

I have a df like this
my_df <- data.frame(
b1 = c(2, 6, 3, 6, 4, 2, 1, 9, NA),
b2 = c(100, 4, 106, 102, 6, 6, 1, 1, 7),
b3 = c(75, 79, 8, 0, 2, 3, 9, 5, 80),
b4 = c(NA, 6, NA, 10, 12, 8, 3, 6, 2),
b5 = c(2, 12, 1, 7, 8, 5, 5, 6, NA),
b6 = c(9, 2, 4, 6, 7, 6, 6, 7, 9),
b7 = c(1, 3, 7, 7, 4, 2, 2, 9, 5),
b8 = c(NA, 8, 4, 5, 1, 4, 1, 3, 6),
b9 = c(4, 5, 7, 9, 5, 1, 1, 2, NA),
b10 = c(14, 2, 4, 2, 1, 1, 1, 1, 5))
I want to create a new column (NEW) which says BLUE or RED based on columns b2 and b3: if b2 is greater than or equal to 100 OR b3 is greater than or equal to 75, then input BLUE; otherwise input RED.
So that I will have something like this:
my_df <- data.frame(
b1 = c(2, 6, 3, 6, 4, 2, 1, 9, NA),
b2 = c(100, 4, 106, 102, 6, 6, 1, 1, 7),
b3 = c(75, 79, 8, 0, 2, 3, 9, 5, 80),
b4 = c(NA, 6, NA, 10, 12, 8, 3, 6, 2),
b5 = c(2, 12, 1, 7, 8, 5, 5, 6, NA),
b6 = c(9, 2, 4, 6, 7, 6, 6, 7, 9),
b7 = c(1, 3, 7, 7, 4, 2, 2, 9, 5),
b8 = c(NA, 8, 4, 5, 1, 4, 1, 3, 6),
b9 = c(4, 5, 7, 9, 5, 1, 1, 2, NA),
b10 = c(14, 2, 4, 2, 1, 1, 1, 1, 5),
NEW = c("BLUE", "BLUE", "BLUE", "BLUE", "RED", "RED", "RED", "RED", "BLUE"))
I have been able to work this out using this:
library (tidyverse)
# Workaround thresholds: deliberately set just below the real cut-offs (100 and
# 75) so that the strict `>` comparisons below act like `>=` on this data.
greater_threshold <- 99.9
greater_threshold1 <- 74.9

# Label a row BLUE when b2 or b3 exceeds its threshold, RED otherwise.
my_df1 <- my_df %>%
  mutate(
    NEW = case_when(
      b2 > greater_threshold ~ "BLUE",
      b3 > greater_threshold1 ~ "BLUE",
      # Catch-all branch: every row not matched above is RED. It is spelled
      # `TRUE` (not `T`, which can be reassigned) and has no leading `+`,
      # so the left-hand side is a genuine logical as case_when() requires.
      TRUE ~ "RED"
    )
  )
At the moment, you can see that I am setting my 'greater threshold' to be slightly less than the required value, and although it works well, my question is this: is there a way to set my 'greater threshold' to be ≥ 100 for b2 and ≥ 75 for b3?

For this example, I'd go with if_else instead of case_when:
library(dplyr)
greater_threshold <- 100
greater_threshold1 <- 75
my_df <- data.frame(
b1 = c(2, 6, 3, 6, 4, 2, 1, 9, NA),
b2 = c(100, 4, 106, 102, 6, 6, 1, 1, 7),
b3 = c(75, 79, 8, 0, 2, 3, 9, 5, 80),
b4 = c(NA, 6, NA, 10, 12, 8, 3, 6, 2),
b5 = c(2, 12, 1, 7, 8, 5, 5, 6, NA),
b6 = c(9, 2, 4, 6, 7, 6, 6, 7, 9),
b7 = c(1, 3, 7, 7, 4, 2, 2, 9, 5),
b8 = c(NA, 8, 4, 5, 1, 4, 1, 3, 6),
b9 = c(4, 5, 7, 9, 5, 1, 1, 2, NA),
b10 = c(14, 2, 4, 2, 1, 1, 1, 1, 5)
)
# Flag each row BLUE when either score reaches its cut-off (inclusive
# comparison), RED otherwise, then print the result.
my_df1 <- my_df %>%
  mutate(
    NEW = if_else(
      condition = b2 >= greater_threshold | b3 >= greater_threshold1,
      true = "BLUE",
      false = "RED"
    )
  )
my_df1
# b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 NEW
# 1 2 100 75 NA 2 9 1 NA 4 14 BLUE
# 2 6 4 79 6 12 2 3 8 5 2 BLUE
# 3 3 106 8 NA 1 4 7 4 7 4 BLUE
# 4 6 102 0 10 7 6 7 5 9 2 BLUE
# 5 4 6 2 12 8 7 4 1 5 1 RED
# 6 2 6 3 8 5 6 2 4 1 1 RED
# 7 1 1 9 3 5 6 2 1 1 1 RED
# 8 9 1 5 6 6 7 9 3 2 1 RED
# 9 NA 7 80 2 NA 9 5 6 NA 5 BLUE

Remove rows from list of dataframes based on condition

I have a list of dataframes. It looks something like this:
# Five example data frames. The two ".wxyz" ones start with four header-like
# rows ("w".."z" in Var1, NA elsewhere); mixing those letters with numbers
# makes their Var1 column character.
df1 <- data.frame(
  Var1 = c(1, 7, 9, 4, 2),
  Var2 = c(7, 2, 4, 4, 3),
  Var3 = c(3, 6, 2, 0, 8)
)
df2 <- data.frame(
  Var1 = c(5, 6, 2, 2, 1),
  Var2 = c(8, 6, 6, 7, 4),
  Var3 = c(9, 0, 1, 3, 4)
)
df3.wxyz <- data.frame(
  Var1 = c("w", "x", "y", "z", 3, 7, 3, 6, 6),
  Var2 = c(rep(NA, 4), 7, 5, 8, 0, 2),
  Var3 = c(rep(NA, 4), 3, 3, 4, 1, 9)
)
df4 <- data.frame(
  Var1 = c(2, 7, 2, 4, 8),
  Var2 = c(8, 3, 1, 7, 3),
  Var3 = c(9, 1, 1, 6, 5)
)
df5.wxyz <- data.frame(
  Var1 = c("w", "x", "y", "z", 2, 7, 3, 1, 6),
  Var2 = c(rep(NA, 4), 7, 4, 8, 1, 9),
  Var3 = c(rep(NA, 4), 8, 0, 4, 1, 2)
)

# Collect them in a list whose element names match the object names.
df.list <- list(
  df1 = df1,
  df2 = df2,
  df3.wxyz = df3.wxyz,
  df4 = df4,
  df5.wxyz = df5.wxyz
)
I would like to remove the first 4 rows of df3.wxyz and df5.wxyz from the list of dataframes as those contain information that I do not need. What I've tried is the following code, but instead of only removing the first 4 rows in df3.wxyz and df5.wxyz, it is removing the first 4 rows from every dataframe in my list. I'm not sure what the issue is.
# Intended to drop the first 4 rows only from the "wxyz" data frames, but it
# trims every element of the list. Why:
#   * inside the function `i` is a single data frame, yet the test
#     `grepl("wxyz", names(df.list))` is computed over ALL names of the list,
#     so it is the same length-5 logical vector on every call;
#   * ifelse() evaluates its `yes` argument whenever at least one element of
#     the test is TRUE, so the assignment `i <- i[-c(1:4), ]` runs on every
#     call, whichever data frame `i` currently is.
# The value returned by ifelse() itself is discarded; only `i` is returned.
df.list <- lapply(df.list, function(i){
ifelse(grepl("wxyz", names(df.list)), i <- i[-c(1:4), ], df.list)
i
})
This is what I would like to achieve:
df1 <- data.frame(Var1 = c(1, 7, 9, 4, 2),
Var2 = c(7, 2, 4, 4, 3),
Var3 = c(3, 6, 2, 0, 8))
df2 <- data.frame(Var1 = c(5, 6, 2, 2, 1),
Var2 = c(8, 6, 6, 7, 4),
Var3 = c(9, 0, 1, 3, 4))
df3.wxyz <- data.frame(Var1 = c(3, 7, 3, 6, 6),
Var2 = c(7, 5, 8, 0, 2),
Var3 = c(3, 3, 4, 1, 9))
df4 <- data.frame(Var1 = c(2, 7, 2, 4, 8),
Var2 = c(8, 3, 1, 7, 3),
Var3 = c(9, 1, 1, 6, 5))
df5.wxyz <- data.frame(Var1 = c(2, 7, 3, 1, 6),
Var2 = c(7, 4, 8, 1, 9),
Var3 = c(8, 0, 4, 1, 2))
df.list <- list(df1, df2, df3.wxyz, df4, df5.wxyz)
names(df.list) <- c("df1", "df2", "df3.wxyz", "df4", "df5.wxyz")

You can try,
# Walk the list together with its names: data frames whose name contains
# "wxyz" lose every row that has an NA (na.omit); all others pass through
# unchanged. Map() keeps the element names of df.list.
df.list <- Map(
  function(frame, frame_name) {
    if (grepl('wxyz', frame_name)) na.omit(frame) else frame
  },
  df.list,
  names(df.list)
)

You can try na.omit like below
> Map(na.omit,df.list)
$df1
Var1 Var2 Var3
1 1 7 3
2 7 2 6
3 9 4 2
4 4 4 0
5 2 3 8
$df2
Var1 Var2 Var3
1 5 8 9
2 6 6 0
3 2 6 1
4 2 7 3
5 1 4 4
$df3.wxyz
Var1 Var2 Var3
5 3 7 3
6 7 5 3
7 3 8 4
8 6 0 1
9 6 2 9
$df4
Var1 Var2 Var3
1 2 8 9
2 7 3 1
3 2 1 1
4 4 7 6
5 8 3 5
$df5.wxyz
Var1 Var2 Var3
5 2 7 8
6 7 4 0
7 3 8 4
8 1 1 1
9 6 9 2

How to sort each column of a df in descending order regarless of the row order?

I am trying to sort my data in descending or ascending order regardless of the data in the rows. I made a dummy example below:
# Toy data: three numeric columns; A and B contain missing values.
A <- c(9, 9, 5, 4, 6, 3, 2, NA)
B <- c(9, 5, 3, 4, 1, 4, NA, NA)
C <- c(1, 4, 5, 6, 7, 4, 2, 4)
base <- data.frame(A, B, C)

# Sort each column on its own (ascending), ignoring how the rows line up.
# `na.last = TRUE` is passed for every column: without it sort() drops NAs,
# which would shorten the vector and make the column assignment fail as soon
# as a column contains a missing value. `TRUE` is spelled out because `T` is
# an ordinary variable that can be reassigned.
df <- base
df$A <- sort(df$A, na.last = TRUE)
df$B <- sort(df$B, na.last = TRUE)
df$C <- sort(df$C, na.last = TRUE)
We get this
structure(list(A = c(2, 3, 3, 4, 4, 4, 5, 5, 6, 9, 9, NA), B = c(1,
2, 3, 4, 4, 4, 5, 5, 9, 10, NA, NA), C = c(1, 2, 3, 4, 4, 4,
5, 5, 6, 7, 8, 8)), row.names = c(NA, -12L), class = "data.frame")
I want to get something similar to df but my data have hundreds of columns, is there an easier way to do it?
I tried arrange_all() but the result is not what i want.
library(tidyverse)
test <- base%>%
arrange_all()
Obtaining this:
structure(list(A = c(2, 3, 3, 4, 4, 4, 5, 5, 6, 9, 9, NA), B = c(NA,
2, 4, 4, 5, 10, 3, 4, 1, 5, 9, NA), C = c(2, 3, 4, 6, 8, 5, 5,
8, 7, 4, 1, 4)), class = "data.frame", row.names = c(NA, -12L
))

You can sort each column individually :
library(dplyr)
# Sort every column on its own, keeping NAs at the bottom so each column keeps
# its length. `everything()` and the lambda replace two deprecated forms:
# across() called without `.cols`, and extra arguments passed through `...`.
base %>% mutate(across(everything(), ~ sort(.x, na.last = TRUE)))
# A B C
#1 2 1 1
#2 3 3 2
#3 4 4 4
#4 5 4 4
#5 6 5 4
#6 9 9 5
#7 9 NA 6
#8 NA NA 7
Or in base R :
# Same per-column sort without dplyr: assigning into `base[]` replaces the
# columns in place, so `base` stays a data frame with its original names.
base[] <- lapply(base, function(column) sort(column, na.last = TRUE))

Tried code in R with mutate_at and max() functions with own data. Warning messages come up: no non-missing arguments to max

I'm currently learning R with a book and was trying the mutate_at function from dplyr. In this example I want to standardize the survey items on a scale from 0 to 1. To do this, we can divide each value by the (theoretical) maximum value of the scale.
The book example stats_test from the package "pradadata" works perfectly fine:
data(stats_test, package = "pradadata")
stats_test %>%
drop_na() %>%
mutate_at(.vars = vars(study_time, self_eval, interest),
.funs = funs(prop = ./max(.))) %>%
select(contains("_prop"))
Output:
study_time_prop self_eval_prop interest_prop
<dbl> <dbl> <dbl>
1 0.6 0.7 0.667
2 0.8 0.8 0.833
3 0.6 0.4 0.167
4 0.8 0.7 0.833
5 0.4 0.6 0.5
6 0.4 0.6 0.667
7 0.8 0.6 0.5
8 0.2 0.7 0.667
9 0.6 0.8 0.833
10 0.6 0.7 0.833
# ... with 1,617 more rows
I tried the same code with my own data, but it doesn't work and I can't figure out why. The variable RG04 from my data has a range from 1-5. I tried to transform the variable from numeric to integer, because the variables from the stats_test data are integer too:
df_literacy_2 <- transform(df_literacy, RG04 = as.integer(RG04))
df_literacy_2 <- tibble(df_literacy_2)
df_literacy_2 %>%
drop_na() %>%
mutate_at(.vars = vars(RG04),
.funs = funs(prop = ./max(.))) %>%
select(contains("_prop"))
Output:
# A tibble: 0 x 0
Warning messages:
1: Problem with `mutate()` input `prop`.
i no non-missing arguments to max; returning -Inf
i Input `prop` is `RG04/max(RG04)`.
2: In base::max(x, ..., na.rm = na.rm) :
no non-missing arguments to max; returning -Inf
str(df_literacy_2$RG04)
int [1:630] 2 4 2 1 2 2 1 3 1 3 ...
Why doesn't it work on my data?
Thank you for your help.
Edit with sample of df_literacy:
> dput(head(df_literacy,20))
structure(list(CASE = c(40, 41, 44, 45, 48, 49, 54, 55, 56, 57,
58, 61, 62, 63, 64, 65, 66, 67, 68, 69), SERIAL = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), REF = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), QUESTNNR = c("base", "base",
"base", "base", "base", "base", "base", "base", "base", "base",
"base", "base", "base", "base", "base", "base", "base", "base",
"base", "base"), MODE = c("interview", "interview", "interview",
"interview", "interview", "interview", "interview", "interview",
"interview", "interview", "interview", "interview", "interview",
"interview", "interview", "interview", "interview", "interview",
"interview", "interview"), STARTED = structure(c(1607290462,
1607290608, 1607291086, 1607291118, 1607291265, 1607291793, 1607294071,
1607294336, 1607294337, 1607294419, 1607294814, 1607296474, 1607301809,
1607329348, 1607333933, 1607335996, 1607336207, 1607336378, 1607343194,
1607343414), tzone = "UTC", class = c("POSIXct", "POSIXt")),
EI01 = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("Ja",
"Nein", "Nicht beantwortet"), class = "factor"), EI02 = c(2,
2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 3),
RF01 = c(4, 2, 4, 3, 4, 4, 1, 3, 2, 3, 4, 3, 2, 3, 2, 2,
4, 2, 5, 3), RF02 = c(1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1,
1, 1, 2, 2, 2, 2, 2, 2), RF03 = c(1, 2, 2, 2, 1, 2, 1, 1,
1, 1, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2), RG01 = c(2, 2, 2, 2,
2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2), RG02 = c(3,
3, 3, 3, 4, 3, 4, 2, 4, 2, 3, 4, 4, 2, 4, 3, 4, 3, 4, 4),
RG03 = c(3, 2, 2, 3, 3, 3, 1, 3, 1, 2, 3, 1, 2, 2, 1, 3,
2, 3, 2, 2), RG04 = c(2, 4, 2, 1, 2, 2, 1, 3, 1, 3, 2, 4,
1, 1, 1, 1, 1, 2, 4, 1), RG05 = c(1, 1, 1, 1, 1, 1, 1, 2,
1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1), SD01 = structure(c(2L,
1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 1L), .Label = c("weiblich", "männlich", "divers",
"nicht beantwortet"), class = "factor"), SD03 = c(4, 3, 2,
2, 1, 2, 4, 4, 1, 4, 3, 1, 2, 3, 2, 4, 2, 3, 1, 3), SD05_01 = c(23,
22, 22, 21, 18, 22, 21, 27, 17, 22, 17, 21, 21, 22, 50, 25,
23, 20, 23, 23), TIME001 = c(2, 3, 23, 73, 29, 2, 3, 3, 29, 7,
50, 55, 3, 2, 10, 2, 1, 5, 7, 35), TIME002 = c(2, 2, 16,
34, 12, 14, 2, 2, 21, 2, 30, 24, 21, 3, 3, 2, 3, 2, 3, 22
), TIME003 = c(34, 8, 12, 15, 13, 12, 12, 7, 13, 11, 16,
10, 11, 16, 8, 8, 7, 8, 11, 14), TIME004 = c(60, 33, 25,
31, 45, 25, 14, 13, 38, 35, 50, 50, 37, 32, 32, 25, 72, 55,
28, 29), TIME005 = c(84, 21, 29, 41, 54, 33, 30, 22, 32,
42, 44, 23, 65, 30, 28, 32, 51, 31, 27, 44), TIME006 = c(14,
9, 27, 11, 24, 8, 8, 9, 18, 12, 35, 33, 27, 46, 11, 15, 8,
14, 12, 14), TIME007 = c(3, 18, 3, 5, 6, 2, 9, 2, 3, 3, 6,
7, 3, 13, 4, 4, 378, 3, 4, 10), TIME_SUM = c(199, 94, 135,
142, 183, 96, 78, 58, 154, 112, 186, 152, 167, 142, 96, 88,
146, 118, 92, 168), MAILSENT = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
LASTDATA = structure(c(1607290661, 1607290702, 1607291221,
1607291328, 1607291448, 1607291889, 1607294149, 1607294394,
1607294491, 1607294531, 1607295045, 1607296676, 1607301976,
1607329490, 1607334030, 1607336084, 1607336727, 1607336496,
1607343286, 1607343582), tzone = "UTC", class = c("POSIXct",
"POSIXt")), FINISHED = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1), Q_VIEWER = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), LASTPAGE = c(7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
MAXPAGE = c(7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7), MISSING = c(7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 0, 7, 7, 7), MISSREL = c(1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1), TIME_RSI = c("46023",
"14246", "0.75", "0.63", "0.54", "12055", "17533", "30682",
"0.7", "44197", "0.45", "0.58", "0.83", "44378", "44501",
"18629", "46753", "46388", "44197", "0.57"), DEG_TIME = c(27,
27, 3, 1, 0, 23, 30, 42, 2, 17, 0, 2, 7, 18, 10, 27, 43,
18, 8, 0)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
Edit with TRUE and FALSE NAs:
> sapply(df_literacy, function(a) table(c(T,F,is.na(a)))-1)
CASE SERIAL REF QUESTNNR MODE STARTED EI01 EI02 RF01 RF02 RF03 RG01 RG02 RG03 RG04 RG05 SD01 SD03 SD05_01 TE03_01 TIME001 TIME002 TIME003
FALSE 630 0 0 630 630 630 630 630 630 630 630 630 630 630 630 630 629 629 615 99 630 630 630
TRUE 0 630 630 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 15 531 0 0 0
TIME004 TIME005 TIME006 TIME007 TIME_SUM MAILSENT LASTDATA FINISHED Q_VIEWER LASTPAGE MAXPAGE MISSING MISSREL TIME_RSI DEG_TIME
FALSE 630 630 629 625 630 0 630 630 630 630 630 630 630 630 630
TRUE 0 0 1 5 0 630 0 0 0 0 0 0 0 0 0

There are a few things to correct here.
drop_na() is removing all of your data.
drop_na(df_literacy)
# # A tibble: 0 x 37
# # ... with 37 variables: CASE <dbl>, SERIAL <lgl>, REF <lgl>, QUESTNNR <chr>,
# # MODE <chr>, STARTED <dttm>, EI01 <fct>, EI02 <dbl>, RF01 <dbl>, RF02 <dbl>,
# # RF03 <dbl>, RG01 <dbl>, RG02 <dbl>, RG03 <dbl>, RG04 <dbl>, RG05 <dbl>,
# # SD01 <fct>, SD03 <dbl>, SD05_01 <dbl>, TIME001 <dbl>, TIME002 <dbl>,
# # TIME003 <dbl>, TIME004 <dbl>, TIME005 <dbl>, TIME006 <dbl>, TIME007 <dbl>,
# # TIME_SUM <dbl>, MAILSENT <lgl>, LASTDATA <dttm>, FINISHED <dbl>,
# # Q_VIEWER <dbl>, LASTPAGE <dbl>, MAXPAGE <dbl>, MISSING <dbl>,
# # MISSREL <dbl>, TIME_RSI <chr>, DEG_TIME <dbl>
The problem is that you have several columns that are completely NA, namely SERIAL, REF, and MAILSENT.
# Count non-missing (row "FALSE") and missing (row "TRUE") values per column.
# Prepending one TRUE and one FALSE guarantees that both levels appear in
# every table, so all columns yield the same two rows; subtracting 1 removes
# those two padding entries again. TRUE/FALSE are spelled out because T and F
# are ordinary variables that can be reassigned.
sapply(df_literacy, function(a) table(c(TRUE, FALSE, is.na(a))) - 1)
# CASE SERIAL REF QUESTNNR MODE STARTED EI01 EI02 RF01 RF02 RF03 RG01 RG02
# FALSE 20 0 0 20 20 20 20 20 20 20 20 20 20
# TRUE 0 20 20 0 0 0 0 0 0 0 0 0 0
# RG03 RG04 RG05 SD01 SD03 SD05_01 TIME001 TIME002 TIME003 TIME004 TIME005
# FALSE 20 20 20 20 20 20 20 20 20 20 20
# TRUE 0 0 0 0 0 0 0 0 0 0 0
# TIME006 TIME007 TIME_SUM MAILSENT LASTDATA FINISHED Q_VIEWER LASTPAGE
# FALSE 20 20 20 0 20 20 20 20
# TRUE 0 0 0 20 0 0 0 0
# MAXPAGE MISSING MISSREL TIME_RSI DEG_TIME
# FALSE 20 20 20 20 20
# TRUE 0 0 0 0 0
Drop the drop_na(), or at least drop_na(-SERIAL, -REF, -MAILSENT).
Your code is using funs, which has been deprecated since dplyr-0.8.0.
# Warning: `funs()` is deprecated as of dplyr 0.8.0.
# Please use a list of either functions or lambdas:
# # Simple named list:
# list(mean = mean, median = median)
# # Auto named with `tibble::lst()`:
# tibble::lst(mean, median)
# # Using lambdas
# list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
While this isn't causing an error, it is causing a warning (and will likely stop working at some point). Change your mutate_at to be:
mutate_at(.vars = vars(RG04, RF02),
.funs = list(prop = ~ . / max(.)))
You are using a single variable within .vars and a single function within .funs, so the column names are preserved as-is (and you will not see a _prop column). From ?mutate_at:
The names of the new columns are derived from the names of the
input variables and the names of the functions.
• if there is only one unnamed function (i.e. if '.funs' is an
unnamed list of length one), the names of the input variables
are used to name the new columns;
• for _at functions, if there is only one unnamed variable
(i.e., if '.vars' is of the form 'vars(a_single_column)') and
'.funs' has length greater than one, the names of the
functions are used to name the new columns;
• otherwise, the new names are created by concatenating the
names of the input variables and the names of the functions,
separated with an underscore '"_"'.
If you aren't going to add more variables and functions, then you need to self-name it in the call, as in mutate_at(.vars = vars(RG04 = RG04), ...). Oddly enough, this causes it to produce RG04_prop.
If we fix all of those, then it works.
# Final pipeline with all fixes applied; prints the first three rescaled values.
df_literacy %>%
# Leave the three all-NA columns out of the completeness check; a bare
# drop_na() would remove every row.
drop_na(-SERIAL, -REF, -MAILSENT) %>%
# Self-naming the variable (RG04 = RG04) is what makes the new column come out
# as RG04_prop instead of overwriting RG04. The lambda divides each value by
# the column maximum.
mutate_at(.vars = vars(RG04 = RG04),
.funs = list(prop = ~ ./max(.))) %>%
# Keep only the newly created *_prop column(s).
select(contains("_prop")) %>%
head(3)
# A tibble: 3 x 1
# RG04_prop
# <dbl>
# 1 0.5
# 2 1
# 3 0.5

Summary with label names with dplyr

I have imported a .sav file with haven, but where I am stuck is that I can't seem to work out how to print the label names in place of, or along with, the label codings. Labels: 1 = unemployed, 2 = looking, etc.
# Per employment status, summarise Gender: mean, group size, sd, min and max.
# The stray `<` in front of group_by() (a syntax error) is removed, and
# min()/max() now receive `na.rm = TRUE`: `is.na = TRUE` is not an argument of
# either function, so it fell into `...` and TRUE (i.e. 1) was compared as one
# more value.
Employment <- select(well_being_df, EmploymentStatus, Gender) %>%
  group_by(EmploymentStatus) %>%
  summarise_all(
    funs(mean, n = n(), sd, min(., na.rm = TRUE), max(., na.rm = TRUE))
  )
# A tibble: 5 x 6
EmploymentStatus mean n sd min max
<dbl+lbl> <dbl> <int> <dbl> <dbl> <dbl>
1 1 1.67 12 0.492 1 2
2 2 1.17 6 0.408 1 2
3 3 1.8 85 0.431 1 3
4 4 1.5 62 0.504 1 2
5 5 1.5 4 0.577 1 2
Ideally:
# A tibble: 5 x 6
EmploymentStatus mean n sd min max
<dbl+lbl> <dbl> <int> <dbl> <dbl> <dbl>
1 1 Unemployed 1.67 12 0.492 1 2
2 2 Looking 1.17 6 0.408 1 2
3 3 Etc 1.8 85 0.431 1 3
4 4 1.5 62 0.504 1 2
5 5 1.5 4 0.577 1 2
dput(head(well_being_df, 10))
structure(list(Age = c(22, 20, 23, 20, 25, 18, 24, 21, 21, 30.7344197070233
), Gender = structure(c(2, 2, 1, 2, 1, 2, 2, 2, 2, 1), labels = c(Male = 1,
Female = 2, Transgender = 3), class = "labelled"), EmploymentStatus = structure(c(3,
1, 4, 3, 3, 3, 3, 4, 3, 4), labels = c(`Unemployed but not looking` = 1,
`Unemployed and looking` = 2, `Part-time` = 3, `Full-time` = 4,
Retired = 5), class = "labelled"), Cognition1 = structure(c(6,
3, 6, 5, 9, 6, 4, 4, 7, 5), labels = c(`Provides nothing that you want` = 0,
`Provides half of what you want` = 5, `Provides all that you want` = 10
), class = "labelled"), Cognition2 = structure(c(7, 3, 8,
5, 8, 5, 5, 7, 7, 3), labels = c(`Far below average` = 0,
`About Average` = 5, `Far above average` = 10), class = "labelled"),
Cognition3 = structure(c(6, 5, 4, 5, 6, 5, 5, 5, 5, 5), labels = c(`Far less than you deserve` = 0,
`About what you deserve` = 5, `Far more than you deserve` = 10
), class = "labelled"), Cognition4 = structure(c(7, 3, 6,
2, 8, 3, 3, 5, 6, 2), labels = c(`Far less than you need` = 0,
`About what you need` = 5, `Far more than you need` = 10), class = "labelled"),
Cognition5 = structure(c(10, 9, 6, 3, 7, 2, 2, 0, 4, 0), labels = c(`Far less than expected` = 0,
`About as expected` = 5, `Far more than expected` = 10), class = "labelled"),
Cognition6 = structure(c(8, 6, 0, 3, 3, 8, 9, 10, 5, 10), labels = c(`Far more than it will in the future` = 0,
`About what you expect in the future` = 5, `Far less than what the future will offer` = 10
), class = "labelled"), Cognition7 = structure(c(9, 7, 10,
5, 6, 2, 3, 0, 8, 3), labels = c(`Far below previous best` = 0,
`Equals previous best` = 5, `Far above previous best` = 10
), class = "labelled")), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))

# Per employment status, summarise Gender: mean, group size, sd, min and max,
# with the status shown by its label text instead of its numeric code.
Employment <- select(well_being_df, EmploymentStatus, Gender) %>%
  # Convert the labelled codes to a factor of their labels (labelled package).
  mutate(EmploymentStatus = labelled::to_factor(EmploymentStatus)) %>%
  group_by(EmploymentStatus) %>%
  summarise_all(
    # A named list of functions/lambdas replaces the deprecated funs().
    # With a single summarised column, the list names become the output
    # column names (mean, n, sd, min, max).
    list(
      mean = mean,
      n = ~ n(),
      sd = sd,
      # `na.rm = TRUE`, not `is.na = TRUE`: an unknown named argument to
      # min()/max() lands in `...` and is compared as one more value, so
      # TRUE (i.e. 1) silently took part in the result.
      min = ~ min(., na.rm = TRUE),
      max = ~ max(., na.rm = TRUE)
    )
  )

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

R Pivot Longer With Multiple Columns - r

Related

variable based on other variables in R

Remove rows from list of dataframes based on condition

How to sort each column of a df in descending order regarless of the row order?

Tried code in R with mutate_at and max() functions with own data. Warning messages come up: no non-missing arguments to max

Summary with label names with dplyr

Categories

Resources