data frame selecting top by grouping - r

I have a data frame such as:
set.seed(1)
df <- data.frame(
sample = 1:50,
value = runif(50),
group = c(rep(NA, 20), gl(3, 10)))
I want to select the top 10 samples based on value. However, if there is a group corresponding to the sample, I only want to include one sample from that group. If group == NA, I want to include all of them. Arranging df by value looks like:
df_top <- df %>%
arrange(-value) %>%
top_n(10, value)
sample value group
1 46 0.7973088 3
2 49 0.8108702 3
3 22 0.8394404 1
4 2 0.8612095 NA
5 27 0.8643395 1
6 20 0.8753213 NA
7 44 0.8762692 3
8 26 0.8921983 1
9 11 0.9128759 NA
10 30 0.9606180 1
I would want to include samples 36, 22, 2, 20, 11, and the next five highest values in my data frame that continue to fit the pattern. How do I accomplish this?

I think I figured this out. Would this be the best way:
df_top <- df %>%
arrange(-value) %>%
group_by(group) %>%
filter(ifelse(!is.na(group), value == max(value), value == value)) %>%
ungroup() %>%
top_n(10, value)
# A tibble: 10 x 3
sample value group
<int> <dbl> <int>
1 18 0.992 NA
2 7 0.945 NA
3 21 0.935 1
4 4 0.908 NA
5 6 0.898 NA
6 35 0.827 2
7 41 0.821 3
8 20 0.777 NA
9 15 0.770 NA
10 17 0.718 NA

Similar method that uses slice instead of filter:
library(dplyr)
df_top <- df %>%
arrange(-value) %>%
group_by(group) %>%
slice(if(any(!is.na(group))) 1 else 1:n()) %>%
ungroup() %>%
top_n(10, value)
Result:
# A tibble: 10 x 3
sample value group
<int> <dbl> <int>
1 21 0.9347052 1
2 35 0.8273733 2
3 41 0.8209463 3
4 18 0.9919061 NA
5 7 0.9446753 NA
6 4 0.9082078 NA
7 6 0.8983897 NA
8 20 0.7774452 NA
9 15 0.7698414 NA
10 17 0.7176185 NA

Related

How to reorder column values ascending order that are seperated by "," and only keep first value in R

I have a column in a df that consists of values like so:
ID
2
NA
1
3
4
5,7
9,6,10
12
15
16
17
NA
19
22,23
I would like to reorder every row based on ascending order. Note - this column is a "character" based field and some rows are already in the correct order.
From there, I only want to keep the first value and remove the others.
Desired output:
ID
2
NA
1
3
4
5
6
12
15
16
17
NA
19
22
You can split the data on comma, sort them and extract the 1st value.
df$ID <- sapply(strsplit(df$ID, ','), function(x) sort(as.numeric(x))[1])
# ID
#1 2
#2 NA
#3 1
#4 3
#5 4
#6 5
#7 6
#8 12
#9 15
#10 16
#11 17
#12 NA
#13 19
#14 22
A couple of tidyverse alternatives.
library(tidyverse)
#1.
#Same as base R but in tidyverse
df %>% mutate(ID = map_dbl(str_split(ID, ','), ~sort(as.numeric(.x))[1]))
#2.
df %>%
mutate(row = row_number()) %>%
separate_rows(ID, sep = ',', convert = TRUE) %>%
group_by(row) %>%
summarise(ID = min(ID)) %>%
select(-row)
Here is another tidyverse solution: Making use of (dyplr, purrr, stringr and readr
library(tidyverse)
df %>%
mutate(ID = map_chr(str_split(ID, ","), ~
toString(sort(as.numeric(.x)))),
ID = parse_number(ID))
)
output:
ID
1 2
2 NA
3 1
4 3
5 4
6 5
7 6
8 12
9 15
10 16
11 17
12 NA
13 19
14 22
We may use the minimum instead of sorting / extracting:
DF <- transform(DF, ID=sapply(strsplit(ID, ','), \(x) min(as.double(x))))
DF
# ID
# 1 2
# 2 NA
# 3 1
# 4 3
# 5 4
# 6 5
# 7 6
# 8 12
# 9 15
# 10 16
# 11 17
# 12 NA
# 13 19
# 14 22
We could use str_extract
library(stringr)
library(dplyr)
df1 %>%
mutate(ID = as.numeric(str_extract(ID, '\\d+')))
-output
ID
1 2
2 NA
3 1
4 3
5 4
6 5
7 9
8 12
9 15
10 16
11 17
12 NA
13 19
14 22
data
df1 <- structure(list(ID = c("2", NA, "1", "3", "4", "5,7", "9,6,10",
"12", "15", "16", "17", NA, "19", "22,23")), class = "data.frame", row.names = c(NA,
-14L))

Calculating a ratio from two columns of data by parameters set in another column

I have date values in wide from and I'm trying to calculate the ratio of the date value with the baseline only within the Start Date and End Dates.
For example:
ID Start Date End Date Baseline 1/18 2/18 3/18 4/18 5/18 6/18 7/18 8/18
A 1/1/2018 5/1/2018 5 2 4 1 3 5 2 4 5
B 6/1/2018 8/1/2018 2 4 2 4 3 6 6 2 1
C 2/1/2018 3/1/2018 8 3 5 5 3 2 7 8 2
D 5/1/2015 7/1/2018 9 1 3 5 7 4 8 9 1
I would like to output to be:
ID Start Date End Date Baseline 1/18 2/18 3/18 4/18 5/18 6/18 7/18 8/18
A 1/1/2018 5/1/2018 5 0.4 0.8 0.2 0.6 1
B 6/1/2018 8/1/2018 2 3 1 0.5
C 2/1/2018 3/1/2018 8 0.625 0.625
D 5/1/2015 7/1/2018 9 0.44 0.88 1
Thank you!
A very inelegant solution with dplyr and tidyr, which someone can probably build on:
library(dplyr)
library(tidyr)
sample <- sample %>% mutate_at(vars(5:12), funs(round(./Baseline, digits = 3))) ## perform the initial simple proportion calculation
sample <- sample %>% gather(5:12, key = "day", value = "value") %>%
rowwise() %>% ## allow for rowwise operations
mutate(value_temp = case_when(any(grepl(as.numeric(str_extract(day, "^[:digit:]{1,2}(?=/)")),
as.numeric(str_extract(StartDate, "^[:digit:]{1,2}(?=/)")):as.numeric(str_extract(EndDate, "^[:digit:]{1,2}(?=/)")))) == T ~ T,
TRUE ~ NA)) ## create a logical vector which indicates TRUE if the "day" is included in the range of days of StartDate and EndDate
sample$value[is.na(sample$value_temp)] <- NA ## sets values which aren't included in the vector of days to NA
sample$value_temp <- NULL ## remove the temp variable
sample <- sample %>% spread(day, value) ## spread to original df
> sample
# A tibble: 4 x 12
ID StartDate EndDate Baseline `1/18` `2/18` `3/18` `4/18` `5/18` `6/18` `7/18` `8/18`
<chr> <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1/1/2018 5/1/2018 5 0.4 0.8 0.2 0.6 1 NA NA NA
2 B 6/1/2018 8/1/2018 2 NA NA NA NA NA 3 1 0.5
3 C 2/1/2018 3/1/2018 8 NA 0.625 0.625 NA NA NA NA NA
4 D 5/1/2015 7/1/2018 9 NA NA NA NA 0.444 0.889 1 NA
Update:
sample <- sample %>% mutate_at(vars(5:12), funs(round(./Baseline, digits = 3)))
sample <- sample %>% gather(5:12, key = "day", value = "value") %>%
rowwise() %>%
mutate(value_temp = case_when(any(grepl(as.numeric(str_extract(day, "^[:digit:]{1,2}(?=/)")),
as.numeric(str_extract(Start_Date, "^[:digit:]{1,2}(?=/)")):as.numeric(str_extract(End_Date, "^[:digit:]{1,2}(?=/)")))) == T &
any(grepl(as.numeric(str_extract(day, "[:digit:]{2}$")),
as.numeric(str_extract(Start_Date, "[:digit:]{2}$")):as.numeric(str_extract(End_Date, "[:digit:]{2}$")))) ~ T,
TRUE ~ NA))
sample$value[is.na(sample$value_temp)] <- NA
sample$value_temp <- NULL
sample$day <- sample$day %>% as_factor()
sample <- sample %>% spread(day, value)

Repeated measures in messy format, need help to tidy

I have a very large data set containing weekly weights that have been coded with week of study and the weight at that visit. There are some missing visits and the data is not currently aligned.
df <- data.frame(ID=1:3, Week_A=c(6,6,7), Weight_A=c(23,24,23), Week_B=c(7,7,8),
Weight_B=c(25,26,27), Week_C=c(8,9,9), Weight_C=c(27,26,28))
df
ID Week_A Weight_A Week_B Weight_B Week_C Weight_C
1 1 6 23 7 25 8 27
2 2 6 24 7 26 9 26
3 3 7 23 8 27 9 28
I would like to align the data by week number (ideal output below).
df_ideal <- data.frame (ID=1:3, Week_6=c(23,24,NA), Week_7=c(25,26,23),
Week_8=c(27,NA,27), Week_9=c(NA,26,28))
df_ideal
ID Week_6 Week_7 Week_8 Week_9
1 1 23 25 27 NA
2 2 24 26 NA 26
3 3 NA 23 27 28
I would appreciate some help with this, even to find a starting point to manipulate this data to an easier to manage format.
A tidyverse solution:
df <- data.frame(ID=1:3,
Week_A=c(6,6,7),
Weight_A=c(23,24,23),
Week_B=c(7,7,8),
Weight_B=c(25,26,27),
Week_C=c(8,9,9),
Weight_C=c(27,26,28))
library(tidyverse)
df_long <- df %>% gather(key="v", value="value", -ID) %>%
separate(v, into=c("v1", "v2")) %>%
spread(v1, value) %>%
complete(ID, Week) %>%
arrange(Week, ID)
df_long
# A tibble: 12 x 4
# ID Week v2 Weight
# <int> <dbl> <chr> <dbl>
# 1 1 6 A 23
# 2 2 6 A 24
# 3 3 6 <NA> NA
# 4 1 7 B 25
# 5 2 7 B 26
# 6 3 7 A 23
# 7 1 8 C 27
# 8 2 8 <NA> NA
# 9 3 8 B 27
#10 1 9 <NA> NA
#11 2 9 C 26
#12 3 9 C 28
df_wide <- df_long %>% select(-v2) %>%
spread(Week, Weight, sep="_")
df_wide
# A tibble: 3 x 5
# ID Week_6 Week_7 Week_8 Week_9
# <int> <dbl> <dbl> <dbl> <dbl>
#1 1 23 25 27 NA
#2 2 24 26 NA 26
#3 3 NA 23 27 28
Personally, I'd keep using df_long instead of df_wide, as it is a tidy data frame, while df_wide is not.
Here is a possible approach using the data.table package
library(data.table)
#convert into a data.table
setDT(df)
#convert into a long format
mdat <- melt(df, id.vars="ID", measure.vars=patterns("^Week", "^Weight", cols=names(df)))
#pivot into desired output
ans <- dcast(mdat, ID ~ value1, value.var="value2")
ans output:
ID 6 7 8 9
1: 1 23 25 27 NA
2: 2 24 26 NA 26
3: 3 NA 23 27 28
And if you really need the "Week_" in your column names, you can use
setnames(ans, names(ans)[-1L], paste("Week_", names(ans)[-1L]))
Another tidyverse solution using a double-gather with a final spread
df %>%
gather(k, v, -ID, -starts_with("Weight")) %>%
separate(k, into = c("k1", "k2")) %>%
unite(k1, k1, v) %>%
gather(k, v, starts_with("Weight")) %>%
separate(k, into = c("k3", "k4")) %>%
filter(k2 == k4) %>%
select(-k2, -k3, -k4) %>%
spread(k1, v)
# ID Week_6 Week_7 Week_8 Week_9
#1 1 23 25 27 NA
#2 2 24 26 NA 26
#3 3 NA 23 27 28
In base R, it's a double reshape, firstly to long and then back to wide on a different variable:
tmp <- reshape(df, idvar="ID", varying=lapply(c("Week_","Weight_"), grep, names(df)),
v.names=c("time","Week"), direction="long")
reshape(tmp, idvar="ID", direction="wide", sep="_")
# ID Week_6 Week_7 Week_8 Week_9
#1.1 1 23 25 27 NA
#2.1 2 24 26 NA 26
#3.1 3 NA 23 27 28

Calculating proportions and ignore NAs

I have a dataset similar to the following and my end goal is to make a table showing variables like mean salary per gender and the females' mean salary as a proportion of men's.
library(dplyr)
x <- data.frame(Department = c("Dep1", "Dep1","Dep2", "Dep2","Dep3"),
Gender = c("F", "M", "F", "M", "F"),
Salary = seq(10,14))
Department Gender Salary
1 Dep1 F 10
2 Dep1 M 11
3 Dep2 F 12
4 Dep2 M 13
5 Dep3 F 14
Step 1: First I calculate the needed summary statistics using summarise.
Table <- x %>% group_by(Department, Gender) %>% summarise(Count = n(),
AverageSalary = mean(Salary, na.rm = T),
MedianSalary = median(Salary, na.rm = T))
Step 2: To calculate the proportion and add the new columns to "Table" I use a tip I got from this forum a few days ago.
Table %>% group_by(Department) %>%
mutate(`AvgSalaryWomen/Men` = AverageSalary[Gender == "F"]/AverageSalary[Gender == "M"],
`MedianSalaryWomen/Men` = MedianSalary[Gender == "F"]/MedianSalary[Gender == "M"])
My challenge is that Dep3 doesn't have any males and so I get the following error message:
Error in mutate_impl(.data, dots) :
Column `AvgSalaryWomen/Men` must be length 1 (the group size), not 0
What I was hoping for was something like this
Department Gender Count AverageSalary MedianSalary AvgSalaryWomen.Men MedianSalaryWomen.Men
1 Dep1 F 1 10 10 0.9090909 0.9090909
2 Dep1 M 1 11 11 0.9090909 0.9090909
3 Dep2 F 1 12 12 0.9230769 0.9230769
4 Dep2 M 1 13 13 0.9230769 0.9230769
5 Dep3 F 1 14 14 NA NA
or this
Department Gender Count AverageSalary MedianSalary AvgSalaryWomen.Men MedianSalaryWomen.Men
1 Dep1 F 1 10 10 0.9090909 0.9090909
2 Dep1 M 1 11 11 NA NA
3 Dep2 F 1 12 12 0.9230769 0.9230769
4 Dep2 M 1 13 13 NA NA
5 Dep3 F 1 14 14 NA NA
Is there an easy way to obtain either of these two results? I'm guessing that alternative 1 would be the easiest.
Thanks in advance!
Using ifelse, you can check if both genders exist in a department before computing the ratios (and if not, returning NA). Something like this:
Table %>% group_by(Department) %>%
mutate(`AvgSalaryWomen/Men` = ifelse(length(unique(Gender)) == 2,
AverageSalary[Gender == "F"]/AverageSalary[Gender == "M"], NA),
`MedianSalaryWomen/Men` = ifelse(length(unique(Gender)) == 2,
MedianSalary[Gender == "F"]/MedianSalary[Gender == "M"], NA))
# A tibble: 5 x 7
# Groups: Department [3]
Department Gender Count AverageSalary MedianSalary `AvgSalaryWomen/Men` `MedianSalaryWomen/Men`
<fct> <fct> <int> <dbl> <int> <dbl> <dbl>
1 Dep1 F 1 10.0 10 0.909 0.909
2 Dep1 M 1 11.0 11 0.909 0.909
3 Dep2 F 1 12.0 12 0.923 0.923
4 Dep2 M 1 13.0 13 0.923 0.923
5 Dep3 F 1 14.0 14 NA NA

how to replace missing values with previous year's binned mean

I have a data frame as below
p1_bin and f1_bin are calculated by cut function by me with
Bins <- function(x) cut(x, breaks = c(0, seq(1, 1000, by = 5)), labels = 1:200)
binned <- as.data.frame (sapply(df[,-1], Bins))
colnames(binned) <- paste("Bin", colnames(binned), sep = "_")
df<- cbind(df, binned)
Now how to calculate mean/avg for previous two years and replace in NA values with in that bin
for example : at row-5 value is NA for p1 and f1 is 30 with corresponding bin 7.. now replace NA with previous 2 years mean for same bin (7) ,i.e
df
ID year p1 f1 Bin_p1 Bin_f1
1 2013 20 30 5 7
2 2013 24 29 5 7
3 2014 10 16 2 3
4 2014 11 17 2 3
5 2015 NA 30 NA 7
6 2016 10 NA 2 NA
df1
ID year p1 f1 Bin_p1 Bin_f1
1 2013 20 30 5 7
2 2013 24 29 5 7
3 2014 10 16 2 3
4 2014 11 17 2 3
5 2015 **22** 30 NA 7
6 2016 10 **16.5** 2 NA
Thanks in advance
I believe the following code produces the desired output. There's probably a much more elegant way than using mean(rev(lag(f1))[1:2]) to get the average of the last two values of f1 but this should do the trick anyway.
library(dplyr)
df %>%
arrange(year) %>%
mutate_at(c("p1", "f1"), "as.double") %>%
group_by(Bin_p1) %>%
mutate(f1 = ifelse(is.na(f1), mean(rev(lag(f1))[1:2]), f1)) %>%
group_by(Bin_f1) %>%
mutate(p1 = ifelse(is.na(p1), mean(rev(lag(p1))[1:2]), p1)) %>%
ungroup
and the output is:
# A tibble: 6 x 6
ID year p1 f1 Bin_p1 Bin_f1
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2013 20 30.0 5 7
2 2 2013 24 29.0 5 7
3 3 2014 10 16.0 2 3
4 4 2014 11 17.0 2 3
5 5 2015 22 30.0 NA 7
6 6 2016 10 16.5 2 NA

Resources