Calculate mean by group among observation without NA - r

Updated:
Hi! I have a data like this.
structure(list(V1QB10 = c(1, 1, 1, 2, 1, 3, 3, 1, 4, 2), V1QB12A = c(2,
1, 2, 3, NA, 2, 2, 3, 2, 2), V1QB12B = c(NA, 2, 2, 2, 2, 1, 2,
2, 2, 2), V1QB12C = c(NA, 1, 2, 2, 2, 1, 2, 2, 2, 2), sum = c(NA,
4, 6, 7, NA, 4, 6, 7, 6, 6)), row.names = c(NA, 10L), class = "data.frame")
This is what the data looks like:
V1QB10 V1QB12A V1QB12B V1QB12C sum
1 1 2 NA NA NA
2 1 1 2 1 4
3 1 2 2 2 6
4 2 3 2 2 7
5 1 NA 2 2 NA
6 3 2 1 1 4
7 3 2 2 2 6
8 1 3 2 2 7
9 4 2 2 2 6
10 2 2 2 2 6
Variable "sum" is the sum of "V1QB12*".
Now I'm trying to calculate the mean of the "sum" by "V1QB10":
dt %>%
group_by(V1QB10) %>%
dplyr::summarise(n=n(), mean=mean(sum), sd=sd(sum)) %>%
as.data.frame()
I expect the calculation to work like this:
for V1QB10==1, the n is 3 (remove 2 observations with NA in "V1QB12*"), and sum up the "sum": 4+6+7=17, then calculate the mean: 17/3, and the sd.
But I found that I keep getting mean of 17/5. Trying to replace the code with n=n(V1QB12A) also didn't work.
Maybe I'm overthinking this problem. How can I fix it?
Thank you!

I'm not completely sure I follow what you're looking for, but the tidyr package has a nifty drop_na() function that will remove the rows containing NAs if you use it like this:
library(dplyr)
library(tidyr)  # drop_na() is exported by tidyr, not dplyr

# Drop every row that has an NA in any column, then rebuild the row total
# from the V1QB12* items.
dt <- dt %>%
  drop_na() %>%
  dplyr::mutate(
    sum = rowSums(dplyr::select(., contains("V1QB12")), na.rm = TRUE)
  )

# Per-group row count, mean and standard deviation of the row totals.
dt %>%
  group_by(V1QB10) %>%
  dplyr::summarise(n = n(), mean = mean(sum), sd = sd(sum)) %>%
  as.data.frame()
Result:
V1QB10 n mean sd
1 1 3 5.666667 1.5275252
2 2 2 6.500000 0.7071068
3 3 2 5.000000 1.4142136
4 4 1 6.000000 NA

Related

Apply same recoding rules to multiple data frames

I have 5 data frames. I want to recode all variables ending with "_comfort", "_agree", and "effective" using the same rules for each data frame. As is, the values in each column are 1:5 and I want to recode 1's to 5, 2's to 4, 4's to 2, and 5's to 1 (3 will stay the same).
I do not want the end result to be one merged dataset, but instead to apply the same recoding rules across all 5 independent data frames. For simplicity's sake, let's just assume I have 2 data frames:
df1 <- data.frame(a_comfort = c(1, 2, 3, 4, 5),
b_comfort = c(1, 2, 3, 4, 5),
c_effective = c(1, 2, 3, 4, 5))
df2 <- data.frame(a_comfort = c(1, 2, 3, 4, 5),
b_comfort = c(1, 2, 3, 4, 5),
c_effective = c(1, 2, 3, 4, 5))
What I want is:
df1 <- data.frame(a_comfort = c(5, 4, 3, 2, 1),
b_comfort = c(5, 4, 3, 2, 1),
c_effective = c(5, 4, 3, 2, 1))
df2 <- data.frame(a_comfort = c(5, 4, 3, 2, 1),
b_comfort = c(5, 4, 3, 2, 1),
c_effective = c(5, 4, 3, 2, 1))
Conventionally, I would use dplyr's mutate_at and ends_with to achieve my goal, but have not been successful with this method across multiple data frames. I am thinking a combination of the purrr and dplyr packages will work, but haven't nailed down the exact technique.
Thanks in advance for any help!
You can use get() and assign() in a loop:
library(dplyr)

# Reverse-score (6 - x) the matching columns of each named data frame and
# write the result back under the same name.
for (df_name in c("df1", "df2")) {
  df <- get(df_name) |>
    mutate(across(ends_with(c("_comfort", "_agree", "_effective")), \(x) 6 - x))
  assign(df_name, df)
}
Result:
#> df1
a_comfort b_comfort c_effective
1 5 5 5
2 4 4 4
3 3 3 3
4 2 2 2
5 1 1 1
#> df2
a_comfort b_comfort c_effective
1 5 5 5
2 4 4 4
3 3 3 3
4 2 2 2
5 1 1 1
Note, however, it’s often better practice to keep multiple related dataframes in a list than loose in the global environment (see). In this case, you can use purrr::map() (or base::lapply()):
library(dplyr)
library(purrr)

# Hold the data frames in one list and reverse-score (6 - x) the matching
# columns of every element.
dfs <- list(df1, df2)
dfs <- map(
  dfs,
  \(df) df |>
    mutate(across(ends_with(c("_comfort", "_agree", "_effective")), \(x) 6 - x))
)
Result:
#> dfs
[[1]]
a_comfort b_comfort c_effective
1 5 5 5
2 4 4 4
3 3 3 3
4 2 2 2
5 1 1 1
[[2]]
a_comfort b_comfort c_effective
1 5 5 5
2 4 4 4
3 3 3 3
4 2 2 2
5 1 1 1
You can use ls(pattern = 'df\\d+') to find all objects whose names match a certain pattern. Then store them into a list and pass to purrr::map or lapply for recoding.
library(dplyr)

# Gather every workspace object whose name matches the pattern into a named
# list, then reverse-score (6 - x) the selected columns of each element.
df.lst <- purrr::map(
  mget(ls(pattern = 'df\\d+')),
  \(d) mutate(
    d,
    6 - across(ends_with(c("_comfort", "_agree", "effective")))
  )
)
# $df1
# a_comfort b_comfort c_effective
# 1 5 5 5
# 2 4 4 4
# 3 3 3 3
# 4 2 2 2
# 5 1 1 1
#
# $df2
# a_comfort b_comfort c_effective
# 1 5 5 5
# 2 4 4 4
# 3 3 3 3
# 4 2 2 2
# 5 1 1 1
You can further overwrite those dataframes in your workspace from the list through list2env().
# Copy each named element of df.lst into the global environment under its name.
list2env(df.lst, .GlobalEnv)
Please try the below code where i convert the columns to factor and then recode them
data
a_comfort b_comfort c_effective
1 1 1 1
2 2 2 2
3 3 3 3
4 4 4 4
5 5 5 5
code
library(tidyverse)

# Reverse the 1-5 coding by turning each selected column into a factor whose
# levels '1'..'5' carry the labels '5'..'1' (the resulting columns are factors).
df1 %>%
  mutate(across(
    c(ends_with('comfort'), ends_with('effective')),
    \(col) factor(
      col,
      levels = c('1', '2', '3', '4', '5'),
      labels = c('5', '4', '3', '2', '1')
    )
  ))
output
a_comfort b_comfort c_effective
1 5 5 5
2 4 4 4
3 3 3 3
4 2 2 2
5 1 1 1

Select rows up to certain value in R

I have the following dataframe:
df1 <- data.frame(ID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2),
var1 = c(0, 2, 3, 4, 2, 5, 6, 10, 11, 0, 1, 2, 1, 5, 7, 10))
I want to select only the rows containing values up to 5, once 5 is reached I want it to go to the next ID and select only values up to 5 for that group so that the final result would look like this:
ID var1
1 0
1 2
1 3
1 4
1 2
1 5
2 0
2 1
2 2
2 1
2 5
I would like to try something with dplyr as it is what I am most familiar with.
You could use which.max() to find the first occurrence of var1 >= 5, and then extract those rows whose row numbers are before it.
library(dplyr)

# Keep each ID's rows up to and including the first var1 >= 5.
# which.max() on an all-FALSE vector returns 1, so an ID that never reaches 5
# would be cut down to its first row; fall back to n() to keep the whole group.
df1 %>%
  group_by(ID) %>%
  filter(
    row_number() <=
      (if (any(var1 >= 5, na.rm = TRUE)) which.max(var1 >= 5) else n())
  ) %>%
  ungroup()
or
# Same selection with slice(): take rows 1..k per ID, where k is the position
# of the first var1 >= 5. If an ID never reaches 5, which.max() would return 1
# for the all-FALSE vector, so use n() instead and keep the whole group.
df1 %>%
  group_by(ID) %>%
  slice(seq_len(
    if (any(var1 >= 5, na.rm = TRUE)) which.max(var1 >= 5) else n()
  )) %>%
  ungroup()
# # A tibble: 11 × 2
# ID var1
# <dbl> <dbl>
# 1 1 0
# 2 1 2
# 3 1 3
# 4 1 4
# 5 1 2
# 6 1 5
# 7 2 0
# 8 2 1
# 9 2 2
# 10 2 1
# 11 2 5

Grouped non-dense rank without omitted values

I have the following data.frame:
df <- data.frame(date = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3),
id = c(4, 4, 2, 4, 1, 2, 3, 1, 2, 2, 1, 1))
And I want to add a new column grp which, for each date, ranks the IDs. Ties should have the same value, but there should be no omitted values. That is, if there are two values which are equally minimum, they should both get rank 1, and the next lowest values should get rank 2.
The expected result would therefore look like this. Note that, as mentioned, the groups are for each date, so the operation must be grouped by date.
data.frame(date = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3),
id = c(4, 4, 2, 4, 1, 2, 3, 1, 2, 2, 1, 1),
grp = c(2, 2, 1, 2, 1, 2, 3, 1, 2, 2, 1, 1))
I'm sure there's a trivial way to do this but I haven't found it: none of the options for tie.method behave in this way (data.table::frank also doesn't help, since it only adds a dense rank).
I thought of doing a normal rank and then using data.table::rleid, but that doesn't work if there are duplicate values separated by other values during the same day.
I also thought of grouping by date and id and then using a group-ID, but the lowest values each day must start at rank 1, so that won't work either.
The only functional solution I've found is to create another table with the unique ids per day and then join that table to this one:
suppressPackageStartupMessages(library(dplyr))
df <- data.frame(date = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3),
id = c(4, 4, 2, 4, 1, 2, 3, 1, 2, 2, 1, 1))
uniques <- df %>%
group_by(
date
) %>%
distinct(
id
) %>%
mutate(
grp = rank(id)
)
# Attach the per-date rank to every row by joining the `uniques` lookup table
# (not the base function `unique`) on the shared columns date and id.
df <- df %>% left_join(
  uniques
) %>% print()
#> Joining, by = c("date", "id")
#> date id grp
#> 1 1 4 2
#> 2 1 4 2
#> 3 1 2 1
#> 4 1 4 2
#> 5 2 1 1
#> 6 2 2 2
#> 7 2 3 3
#> 8 2 1 1
#> 9 3 2 2
#> 10 3 2 2
#> 11 3 1 1
#> 12 3 1 1
Created on 2020-05-08 by the reprex package (v0.3.0)
However, this seems quite inelegant and convoluted for what seems like a simple operation, so I'd rather see if other solutions are available.
Curious to see data.table solutions if available, but unfortunately the solution must be in dplyr.
We can use dense_rank
library(dplyr)

# dense_rank() gives tied ids the same rank and leaves no gaps; grouping by
# date restarts the ranking for every date.
df |>
  group_by(date) |>
  mutate(grp = dense_rank(id))
# A tibble: 12 x 3
# Groups: date [3]
# date id grp
# <dbl> <dbl> <int>
# 1 1 4 2
# 2 1 4 2
# 3 1 2 1
# 4 1 4 2
# 5 2 1 1
# 6 2 2 2
# 7 2 3 3
# 8 2 1 1
# 9 3 2 2
#10 3 2 2
#11 3 1 1
#12 3 1 1
Or with frank
library(data.table)

# Convert df to a data.table by reference and add the dense rank of id
# computed within each date.
setDT(df)[, grp := frank(id, ties.method = "dense"), by = date]

Generate random sequential number by group with multiple times

I'm trying to generate random number by group with multiple times.
For example,
> set.seed(1002)
> df<-data.frame(ID=LETTERS[seq(1:5)],num=sample(c(2,3,4), size=5, replace=TRUE))
> df
ID num
1 A 3
2 B 4
3 C 3
4 D 2
5 E 3
In ID, I want to generate sequential random number without replacement with (for example) 4 times.
If ID is A, it will randomly select numbers among 1:3 4 times. So, this will be
sample(c(1,2,3,1,2,3,1,2,3),replace=FALSE)
or
# Shuffle once, then repeat that permutation 4 times.
rep(sample(c(1:4), replace = FALSE), times = 4)
If the result is 3 2 1 2 1 3 2 3 3 1 1 2, then the data will be
ID num
1 A 3
2 A 2
3 A 2
4 A 1
5 A 1
6 A 3
7 A 2
8 A 1
9 A 3
I tried several things, like
df%>%group_by(ID)%>%mutate(random=sample(rep(1:num,times=4),replace=FALSE))
It failed. The warning appeared with In 1:num
I also tried this.
ddply(df,.(ID),function(x) sample(rep(1:num,times=4),replace=FALSE))
The error appeared again, with NA/NaN.
I would really appreciate if you let me know how to solve this problem.
We can create a list-column and then unnest it to have separate rows.
n <- 4
library(dplyr)

# For each ID, shuffle n copies of 1..num, store the vector in a list-column,
# then unnest it so every drawn value gets its own row.
df |>
  group_by(ID) |>
  mutate(num = list(sample(rep(seq_len(num), times = n)))) |>
  tidyr::unnest(num)
# ID num
# <fct> <int>
# 1 A 2
# 2 A 2
# 3 A 2
# 4 A 3
# 5 A 3
# 6 A 1
# 7 A 3
# 8 A 1
# 9 A 1
#10 A 3
# … with 50 more rows
I'm not quite clear on your expected output.
The following samples num elements from 1:num with replacement, and stores samples in a list column sample.
library(tidyverse)
set.seed(2018)

# For each row, draw `num` values from 1..num with replacement and keep them
# in the list-column `sample`.
df %>%
  mutate(sample = map(num, \(k) sample(seq_len(k), replace = TRUE)))
# ID num sample
#1 A 2 1, 1
#2 B 4 3, 4, 1, 2
#3 C 2 1, 1
#4 D 4 3, 3, 4, 4
#5 E 2 2, 2
Or if you want to repeat sampling num elements (with replacement) 4 times, you can do
set.seed(2018)

# Repeat the with-replacement draw 4 times per row; as.numeric() flattens the
# result of replicate() into one plain vector per row.
df %>%
  mutate(
    sample = map(
      num,
      \(k) as.numeric(replicate(4, sample(seq_len(k), replace = TRUE)))
    )
  )
#ID num sample
#1 A 2 1, 1, 1, 2, 1, 2, 1, 1
#2 B 4 3, 3, 4, 4, 4, 4, 4, 2, 3, 4, 3, 3, 2, 1, 1, 2
#3 C 2 1, 1, 1, 1, 1, 1, 1, 2
#4 D 4 2, 3, 2, 1, 3, 4, 1, 2, 1, 2, 2, 1, 1, 1, 2, 1
#5 E 2 2, 1, 2, 2, 1, 1, 1, 2

How to sum a substring reference

I'm attempting to select the correct column to sum the total of a from within a data frame column using ddply:
df2 <- ddply(df1,'col1', summarise, total = sum(substr(variable,1,3)))
It appears not to be working because you can't sum a character, but I am trying to pass the reference to the column, not sum the literal result of the substring. Is there a way to get around this?
Example Data & Desired output:
variable = "Aug 2017"
col1 Jun Jul Aug
1 A 1 2 3
2 A 1 2 3
3 A 1 2 3
4 A 1 2 3
5 A 1 2 3
6 B 2 3 4
7 B 2 3 4
8 B 2 3 4
9 C 3 4 5
10 C 3 4 5
Desired Output:
1 A 15
2 B 12
3 C 10
This works with dplyr instead of plyr.
# create data
df1 <- data.frame(
  col1 = c(rep("A", 5), rep("B", 3), rep("C", 2)),
  Jun = c(1, 1, 1, 1, 1, 2, 2, 2, 3, 3),
  Jul = c(2, 2, 2, 2, 2, 3, 3, 3, 4, 4),
  Aug = c(3, 3, 3, 3, 3, 4, 4, 4, 5, 5)
)
variable <- "Aug 2017"

# load dplyr library
library(dplyr)

# Keep col1 plus the column whose name matches the first three characters of
# `variable` ("Aug"), then sum that column within each col1 group.
# across() replaces the deprecated summarize_each().
df1 %>%
  select(col1, matches(substr(variable, 1, 3))) %>%
  group_by(col1) %>%
  summarise(across(everything(), sum))
# A tibble: 3 × 2
col1 Aug
<fctr> <dbl>
1 A 15
2 B 12
3 C 10
I also highly recommend reading about nonstandard and standard evaluation, here:
http://adv-r.had.co.nz/Computing-on-the-language.html

Resources