Ordering a data frame by the biggest group (R)

I want to sort a data frame by the sum of each group. That is, I don't want the data frame ordered by group number, but by the total of each group: I want to know which group is the biggest (1, 2 or 3) and then order the rows accordingly. So if group 3 is the biggest, I want group 3 at the top, with its values in descending order.
set.seed(123)
d <- data.frame(
  x = runif(90),
  grp = gl(3, 30))
Thanks!

One way, using dplyr, is:
library(dplyr)

d %>%
  group_by(grp) %>%
  mutate(sum_ = sum(x)) %>%
  arrange(desc(sum_), desc(x)) %>%
  select(-sum_)
Basically, we create a temporary variable sum_ that indicates the sum of x by group, and then arrange according to sum_ first and x second. Afterwards, we remove sum_ since it's no longer needed.
Output
# A tibble: 90 x 2
# Groups: grp [3]
# x grp
# <dbl> <fct>
# 1 0.994 1
# 2 0.957 1
# 3 0.955 1
# 4 0.940 1
# 5 0.900 1
# 6 0.892 1
# 7 0.890 1
# 8 0.883 1
# 9 0.788 1
# 10 0.709 1
# ... with 80 more rows
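Note that mutate() keeps the grouping, which is why the output above still shows # Groups: grp [3]. If you want a plain ungrouped result, append ungroup() at the end (a minor variant of the same pipeline):
d %>%
  group_by(grp) %>%
  mutate(sum_ = sum(x)) %>%
  arrange(desc(sum_), desc(x)) %>%
  select(-sum_) %>%
  ungroup()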

Another option is to reorder the levels of grp according to the sum of x. This can be done with the tidyverse (fct_reorder() from forcats) like this:
library(tidyverse)
set.seed(123)
df <- tibble(
  x = runif(90),
  grp = gl(3, 30)
)
df %>%
  mutate(
    grp = fct_reorder(grp, x, .fun = "sum", .desc = TRUE)
  ) %>%
  arrange(grp)
#> # A tibble: 90 x 2
#> x grp
#> <dbl> <fct>
#> 1 0.288 1
#> 2 0.788 1
#> 3 0.409 1
#> 4 0.883 1
#> 5 0.940 1
#> 6 0.0456 1
#> 7 0.528 1
#> 8 0.892 1
#> 9 0.551 1
#> 10 0.457 1
#> # … with 80 more rows
df %>%
  group_by(grp) %>%
  summarise(tot = sum(x))
#> # A tibble: 3 x 2
#> grp tot
#> <fct> <dbl>
#> 1 1 17.2
#> 2 2 13.2
#> 3 3 15.4
Created on 2020-06-17 by the reprex package (v0.3.0)

In base R you can first use order() on the whole data frame, then split by grp and rbind the pieces in the order given by rowsum().
d <- d[order(-d$x),]
do.call(rbind, split(d, d$grp)[order(-rowsum(d$x, d$grp))])
# x grp
#1.24 0.9942697766 1
#1.11 0.9568333453 1
#1.20 0.9545036491 1
#1.5 0.9404672843 1
#1.16 0.8998249704 1
#1.8 0.8924190444 1
#1.21 0.8895393161 1
#1.4 0.8830174040 1
#1.2 0.7883051354 1
#1.26 0.7085304682 1
#1.22 0.6928034062 1
#1.13 0.6775706355 1
#1.25 0.6557057991 1
#1.23 0.6405068138 1
#1.28 0.5941420204 1
#1.14 0.5726334020 1
#1.9 0.5514350145 1
#1.27 0.5440660247 1
#1.7 0.5281054880 1
#1.10 0.4566147353 1
#1.12 0.4533341562 1
#1.3 0.4089769218 1
#1.19 0.3279207193 1
#1.29 0.2891597373 1
#1.1 0.2875775201 1
#1.17 0.2460877344 1
#1.30 0.1471136473 1
#1.15 0.1029246827 1
#1.6 0.0455564994 1
#1.18 0.0420595335 1
#3.87 0.9849569800 3
#3.88 0.8930511144 3
#3.89 0.8864690608 3
#3.65 0.8146400389 3
#3.68 0.8123895095 3
#3.67 0.8100643530 3
#3.69 0.7943423211 3
#3.84 0.7881958340 3
#3.71 0.7544751586 3
#3.73 0.7101824014 3
#3.82 0.6680555874 3
#3.61 0.6651151946 3
#3.72 0.6292211316 3
#3.78 0.6127710033 3
#3.75 0.4753165741 3
#3.66 0.4485163414 3
#3.70 0.4398316876 3
#3.86 0.4348927415 3
#3.83 0.4176467797 3
#3.63 0.3839696378 3
#3.77 0.3798165377 3
#3.79 0.3517979092 3
#3.64 0.2743836446 3
#3.81 0.2436194727 3
#3.76 0.2201188852 3
#3.90 0.1750526503 3
#3.80 0.1111354243 3
#3.85 0.1028646443 3
#3.62 0.0948406609 3
#3.74 0.0006247733 3
#2.31 0.9630242325 2
#2.32 0.9022990451 2
#2.59 0.8950453592 2
#2.50 0.8578277153 2
#2.53 0.7989248456 2
#2.34 0.7954674177 2
#2.37 0.7584595375 2
#2.58 0.7533078643 2
#2.33 0.6907052784 2
#2.55 0.5609479838 2
#2.36 0.4777959711 2
#2.48 0.4659624503 2
#2.52 0.4422000742 2
#2.42 0.4145463358 2
#2.43 0.4137243263 2
#2.60 0.3744627759 2
#2.44 0.3688454509 2
#2.39 0.3181810076 2
#2.49 0.2659726404 2
#2.47 0.2330340995 2
#2.40 0.2316257854 2
#2.38 0.2164079358 2
#2.56 0.2065313896 2
#2.45 0.1524447477 2
#2.41 0.1428000224 2
#2.46 0.1388060634 2
#2.57 0.1275316502 2
#2.54 0.1218992600 2
#2.51 0.0458311667 2
#2.35 0.0246136845 2
Another option is to reorder the levels of grp, which can then be used in order().
d$grp <- factor(d$grp, levels(d$grp)[order(-rowsum(d$x, d$grp))])
d[order(d$grp, -d$x),]
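The same idea can also be written as a single base line, using ave() to attach each row's group sum (a compact sketch; it should give the same row order as the split/rbind approach above):
d[order(-ave(d$x, d$grp, FUN = sum), -d$x), ]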

Related

How to change iteratively the values of a column in R without a loop?

I would like to iteratively update the values of a column (value2 in the example). value2 at time i depends on value1 and on the already-updated value2 at times i and i-1.
Time values are stored in ascending order.
The treatment is done separately for each value of the group column.
But as described in my example, I can't manage to update value2 with accumulate2 (from the purrr package).
Maybe someone could give me some advice on how to do this.
Thank you.
library(dplyr)

input <- data.frame(group = c(1,1,1,2,2,2,2),
                    time = c(1,2,3,1,2,3,4),
                    value1 = c(4,2,2,3,3,3,3),
                    value2 = c(4,2,1,3,3,1,1))
input <- arrange(input, group, time)

my_function <- function(df) {
  df %>%
    as_tibble() %>%
    group_by(group) %>%
    mutate(value2 = purrr::accumulate2(
      .x = value2,
      .y = ((value1 == lag(value1)) &
              (lag(value2) == value1) &
              (value1 != value2))[-1],
      .f = function(.i_1, .i, .y) {
        if (.y) {.i_1} else {.i}
      }) %>% unlist())
}
> input
group time value1 value2
1 1 1 4 4
2 1 2 2 2
3 1 3 2 1
4 2 1 3 3
5 2 2 3 3
6 2 3 3 1
7 2 4 3 1
output <- my_function(input)
> output
group time value1 value2
1 1 1 4 4
2 1 2 2 2
3 1 3 2 2 -> data change (OK)
4 2 1 3 3
5 2 2 3 3
6 2 3 3 3 -> data change (OK)
7 2 4 3 1 -> no data change / should be replaced by 3
It seems that your problem lies in your algorithm. Unfortunately, since you didn't explain it here, we cannot help you with that part.
purrr::accumulate2 can be hard to use, so I advise you to split your code as much as possible. This will make your code much more readable, and will make debugging and finding errors much easier.
For instance, consider this:
library(tidyverse)
input <- data.frame(group=c(1,1,1,2,2,2,2),
time=c(1,2,3,1,2,3,4),
value1=c(4,2,2,3,3,3,3),
value2=c(4,2,1,3,3,1,1))
input <- arrange(input, group,time)
#document your functions when it
#' @param .i_1 this is ...
#' @param .i this is ...
#' @param .y this is ...
my_accu_function = function(.i_1, .i, .y) {
  if (.y) {.i_1} else {.i}
}

my_function <- function(df) {
  df %>%
    as_tibble() %>%
    group_by(group) %>%
    mutate(
      cond = value1 == lag(value1) &
        lag(value2) == value1 &
        value1 != value2,
      value2_update = purrr::accumulate2(.x = value2,
                                         .y = cond[-1],
                                         .f = my_accu_function) %>% unlist()
    )
}
input
#> group time value1 value2
#> 1 1 1 4 4
#> 2 1 2 2 2
#> 3 1 3 2 1
#> 4 2 1 3 3
#> 5 2 2 3 3
#> 6 2 3 3 1
#> 7 2 4 3 1
output = my_function(input)
output
#> # A tibble: 7 x 6
#> # Groups: group [2]
#> group time value1 value2 cond value2_update
#> <dbl> <dbl> <dbl> <dbl> <lgl> <dbl>
#> 1 1 1 4 4 FALSE 4
#> 2 1 2 2 2 FALSE 2
#> 3 1 3 2 1 TRUE 2
#> 4 2 1 3 3 FALSE 3
#> 5 2 2 3 3 FALSE 3
#> 6 2 3 3 1 TRUE 3
#> 7 2 4 3 1 FALSE 1
stopifnot(output$value2_update[7]==3)
#> Error: output$value2_update[7] == 3 is not TRUE
Created on 2022-05-11 by the reprex package (v2.0.1)
You can see that cond is FALSE for the last row, so accumulate2 did its job and kept the current value 1 rather than carrying over the previous value 3.
If you explain your algorithm to us, maybe we can help you with setting the proper condition cond so that you get the right output.
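For reference, here is a minimal toy illustration of how accumulate2() calls .f (just the mechanics, not your algorithm): .f receives the accumulated value, the next element of .x and the next element of .y, and .y must be one element shorter than .x because the first element of .x seeds the accumulator.
library(purrr)
accumulate2(.x = c(4, 2, 1),
            .y = c(FALSE, TRUE),  # one element shorter than .x
            .f = function(prev, cur, keep_prev) if (keep_prev) prev else cur)
# returns 4, 2, 2 (as a list in older purrr versions, which is why the code above pipes into unlist())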

How to summarize across multiple columns with condition on another (grouped) column with dplyr?

I need to summarize a data.frame across multiple columns in a generic way:
the first summarize operation is straightforward, e.g. a simple median;
the second summarize includes a condition on another column, e.g. taking each column's value at the row where another column reaches its (per-group) minimum:
set.seed(4)
myDF = data.frame(i = rep(1:3, each = 3),
                  j = rnorm(9),
                  a = sample.int(9),
                  b = sample.int(9),
                  c = sample.int(9),
                  d = 'foo')
# i j a b c d
# 1 1 0.2167549 4 5 5 foo
# 2 1 -0.5424926 7 7 4 foo
# 3 1 0.8911446 3 9 1 foo
# 4 2 0.5959806 8 6 8 foo
# 5 2 1.6356180 6 8 3 foo
# 6 2 0.6892754 1 4 6 foo
# 7 3 -1.2812466 9 1 7 foo
# 8 3 -0.2131445 5 2 2 foo
# 9 3 1.8965399 2 3 9 foo
library(dplyr)
myDF %>%
  group_by(i) %>%
  summarize(across(where(is.numeric), median, .names = "med_{col}"),
            best_a = a[[which.min(j)]],
            best_b = b[[which.min(j)]],
            best_c = c[[which.min(j)]])
# # A tibble: 3 x 8
# i med_j med_a med_b med_c best_a best_b best_c
# * <int> <dbl> <int> <int> <int> <int> <int> <int>
# 1 1 0.217 4 7 4 7 7 4
# 2 2 0.689 6 6 6 8 6 8
# 3 3 -0.213 5 2 7 9 1 7
How can I define this second summarize operation in a generic way (i.e., not manually as done above)?
Hence I would need something like this (which obviously does not work as j is not recognized):
myfns = list(med = ~median(.),
             best = ~.[[which.min(j)]])
myDF %>%
  group_by(i) %>%
  summarize(across(where(is.numeric), myfns, .names = "{fn}_{col}"))
# Error: Problem with `summarise()` input `..1`.
# x object 'j' not found
# ℹ Input `..1` is `across(where(is.numeric), myfns, .names = "{fn}_{col}")`.
# ℹ The error occurred in group 1: i = 1.
Use another across() to get the corresponding values of columns a:c at the row where j is at its minimum.
library(dplyr)
myDF %>%
  group_by(i) %>%
  summarize(across(where(is.numeric), median, .names = "med_{col}"),
            across(a:c, ~.[which.min(j)], .names = 'best_{col}'))
# i med_j med_a med_b med_c best_a best_b best_c
#* <int> <dbl> <int> <int> <int> <int> <int> <int>
#1 1 0.217 4 7 4 7 7 4
#2 2 0.689 6 6 6 8 6 8
#3 3 -0.213 5 2 7 9 1 7
To do it in the same across() statement:
myDF %>%
  group_by(i) %>%
  summarize(across(where(is.numeric), list(med = median,
                                           best = ~.[which.min(j)]),
                   .names = "{fn}_{col}"))
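A small side effect of this combined form: since j is itself numeric, you also get a best_j column, which is simply the per-group minimum of j. If that is not wanted, drop it afterwards, e.g.:
myDF %>%
  group_by(i) %>%
  summarize(across(where(is.numeric), list(med = median,
                                           best = ~.[which.min(j)]),
                   .names = "{fn}_{col}")) %>%
  select(-best_j)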

Getting the same value when estimating the mean across different categories

I am trying to estimate the mean value of a lab test reading igg1_norm across different categories of a variable type (5 categories).
db <- forg %>%
  group_by(forg$type) %>%
  summarise(mean = mean(forg$igg1_norm, na.rm = TRUE),
            sd = sd(forg$igg1_norm, na.rm = TRUE),
            lower = mean(forg$igg1_norm, na.rm = TRUE) - sd(forg$igg1_norm, na.rm = TRUE),
            upper = mean(forg$igg1_norm, na.rm = TRUE) + sd(forg$igg1_norm, na.rm = TRUE))
My data looks like as per below
cowidfarm type time_num igg1_norm igg2_norm
<chr> <fct> <fct> <labelled> <labelled>
1 LM1047 3 1 0.1080482 0.4526854
2 LM1047 3 2 0.1833975 0.6029548
3 LM1047 3 3 0.1704118 0.5394913
4 LM1050 1 1 0.2883397 0.4347826
5 LM1050 1 2 0.1453905 0.5655340
6 LM1050 1 3 0.3302948 0.4962779
7 LM1134 3 1 0.4498922 0.6672078
8 LM1134 3 2 0.2641302 0.6204986
9 LM1134 3 3 0.3207913 0.5074442
10 LM1221 3 1 1.2184955 0.8653846
I get no errors when running the code but the output is odd as I get only one value when I was expecting 5 values (one for each category).
mean sd lower upper
1 0.4046562 0.3239133 0.08074287 0.7285695
Does anyone have an idea of what I am doing wrong? Any help is dearly appreciated
Not sure what else you tried, but the forg$ prefix is the likely culprit: inside summarise(), forg$igg1_norm refers to the whole original column rather than the per-group slice, so the grouping has no effect on the computed statistics. When I remove forg$ from your code I get:
library(dplyr)
forg %>%
  group_by(type) %>%
  summarise(mean = mean(igg1_norm, na.rm = TRUE),
            sd = sd(igg1_norm, na.rm = TRUE),
            lower = mean(igg1_norm, na.rm = TRUE) - sd(igg1_norm, na.rm = TRUE),
            upper = mean(igg1_norm, na.rm = TRUE) + sd(igg1_norm, na.rm = TRUE))
#> # A tibble: 2 x 5
#> type mean sd lower upper
#> * <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.255 0.0969 0.158 0.352
#> 2 3 0.388 0.383 0.00469 0.771
DATA
forg <- read.table(text = "cowidfarm type time_num igg1_norm igg2_norm
1 LM1047 3 1 0.1080482 0.4526854
2 LM1047 3 2 0.1833975 0.6029548
3 LM1047 3 3 0.1704118 0.5394913
4 LM1050 1 1 0.2883397 0.4347826
5 LM1050 1 2 0.1453905 0.5655340
6 LM1050 1 3 0.3302948 0.4962779
7 LM1134 3 1 0.4498922 0.6672078
8 LM1134 3 2 0.2641302 0.6204986
9 LM1134 3 3 0.3207913 0.5074442
10 LM1221 3 1 1.2184955 0.8653846 ", header = TRUE)
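As a side note, you can avoid repeating the mean/sd calls for lower and upper by summarising once and deriving the bounds with mutate() afterwards (a small refactor that should give the same numbers):
forg %>%
  group_by(type) %>%
  summarise(mean = mean(igg1_norm, na.rm = TRUE),
            sd = sd(igg1_norm, na.rm = TRUE)) %>%
  mutate(lower = mean - sd, upper = mean + sd)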

Easy way to convert text columns to numeric

I have this example dataset
x <- c("hot", "cold", "warm", "hot", "hot")
y <- c("happy", "content", "happy", "sad", "annoyed")
df <- data.frame(x, y)
I want to find a quick way to convert the text to numbers; it doesn't matter what order the numbers are assigned in.
So the output would be:
x y
1 1
2 2
3 1
1 3
1 4
Many Thanks
With Base R:
df[] <- lapply(df, function(x) as.numeric(as.factor(x)))
df
#> x y
#> 1 2 3
#> 2 1 2
#> 3 3 3
#> 4 2 4
#> 5 2 1
With purrr:
library(purrr)
df %>% map(as.factor) %>% map_dfc(as.numeric)
#> # A tibble: 5 x 2
#> x y
#> <dbl> <dbl>
#> 1 2 3
#> 2 1 2
#> 3 3 3
#> 4 2 4
#> 5 2 1
Keep track of the labels with labelled:
df <- df %>% map(as.factor) %>% map_dfc(labelled::to_labelled)
df
#> # A tibble: 5 x 2
#> x y
#> <dbl+lbl> <dbl+lbl>
#> 1 2 [hot] 3 [happy]
#> 2 1 [cold] 2 [content]
#> 3 3 [warm] 3 [happy]
#> 4 2 [hot] 4 [sad]
#> 5 2 [hot] 1 [annoyed]
df$x
#> <labelled<double>[5]>
#> [1] 2 1 3 2 2
#>
#> Labels:
#> value label
#> 1 cold
#> 2 hot
#> 3 warm
Or keep the numbers next to the original values in a new column:
df[paste0(names(df), "_num")] <- lapply(df, function(x) as.numeric(as.factor(x)))
df
#> x y x_num y_num
#> 1 hot happy 2 3
#> 2 cold content 1 2
#> 3 warm happy 3 3
#> 4 hot sad 2 4
#> 5 hot annoyed 2 1
If you want to change only the character columns to numeric:
library(purrr)
df %>% map_if(is.character, as.factor) %>% map_dfc(as.numeric)
df %>% map_if(is.character, as.factor) %>% map_dfc(labelled::to_labelled)
Or choose them by name:
library(purrr)
cols <- c("x", "y")
df %>% map_at(cols, as.factor) %>% map_dfc(as.numeric)
df %>% map_at(cols, as.factor) %>% map_dfc(labelled::to_labelled)
df[paste0(cols, "_num")] <- lapply(df[cols], function(x) as.numeric(as.factor(x)))
You could use rapply:
rapply(type.convert(df), function(x) as.integer(factor(x, unique(x))),
       classes = 'factor', how = 'replace')
x y
1 1 1
2 2 2
3 3 1
4 1 3
5 1 4
Maybe try this with dplyr:
library(dplyr)
#Code
newdf <- df %>% mutate(across(everything(), ~ as.numeric(as.factor(.))))
Output:
x y
1 2 3
2 1 2
3 3 3
4 2 4
5 2 1
To keep the original values visible next to the numeric codes, you can try this:
#Code 2
newdf2 <- df %>%
  mutate(across(everything(), ~ as.factor(.))) %>%
  mutate(across(everything(), .fns = list(value = ~ as.numeric(.))))
Output:
x y x_value y_value
1 hot happy 2 3
2 cold content 1 2
3 warm happy 3 3
4 hot sad 2 4
5 hot annoyed 2 1
If the data frame also contains a numeric column (here called number), restrict across() to the text columns and this should work:
#Code 3
newdf <- df %>%
  mutate(across(x:y, ~ as.factor(.))) %>%
  mutate(across(x:y, .fns = list(value = ~ as.numeric(.))))
Output:
x y number x_value y_value
1 hot happy 10 2 3
2 cold content 20 1 2
3 warm happy 30 3 3
4 hot sad 40 2 4
5 hot annoyed 50 2 1
We can use match(), which numbers values in order of first appearance and therefore reproduces the desired output exactly:
df[] <- lapply(df, function(x) match(x, unique(x)))
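Starting from the original df, this gives exactly the numbering shown in the question (hot/happy are seen first and become 1, and so on):
#  x y
#1 1 1
#2 2 2
#3 3 1
#4 1 3
#5 1 4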

Add original values for columns after group by

For the data frame below I want to keep the original values of VAR_X after a group_by() on ID and event and taking the max() of quest, but I cannot get my code right. Any suggestions? By the way, in my original data frame more than one column needs to be added back.
df <- data.frame(ID = c(1,1,1,1,1,1,2,2,2,3,3,3),
                 quest = c(1,1,2,2,3,3,1,2,3,1,2,3),
                 event = c("A","B","A","B","A",NA,"C","D","C","D","D",NA),
                 VAR_X = c(2,4,3,6,3,NA,6,4,5,7,5,NA))
Code:
df %>%
  group_by(ID, event) %>%
  summarise(quest = max(quest))
Desired output:
ID quest event VAR_X
1 1 2 B 6
2 1 3 A 3
3 2 2 D 4
4 2 3 C 5
5 3 2 D 5
Start by omitting the NA values, then do an inner_join() with the original data set.
df %>%
  na.omit() %>%
  group_by(ID, event) %>%
  summarise(quest = max(quest)) %>%
  inner_join(df, by = c("ID", "event", "quest"))
## A tibble: 5 x 4
## Groups: ID [3]
# ID event quest VAR_X
# <dbl> <fct> <dbl> <dbl>
#1 1 A 3 3
#2 1 B 2 6
#3 2 C 3 5
#4 2 D 2 4
#5 3 D 2 5
Alternatively, filter() keeps the rows where quest is at its per-group maximum, so all other columns are preserved without a join:
df %>%
  drop_na() %>% # remove if necessary
  group_by(ID, event) %>%
  filter(quest == max(quest)) %>%
  ungroup()
# A tibble: 5 x 4
# ID quest event VAR_X
#<dbl> <dbl> <chr> <dbl>
# 1 1 2 B 6
# 2 1 3 A 3
# 3 2 2 D 4
# 4 2 3 C 5
# 5 3 2 D 5
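If you are on dplyr 1.0.0 or later, slice_max() expresses the same idea directly (a sketch; note that with_ties = TRUE is the default, so exact ties in quest would keep more than one row per group):
df %>%
  tidyr::drop_na() %>%
  group_by(ID, event) %>%
  slice_max(quest, n = 1) %>%
  ungroup()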
