I have a data frame that looks like this:
col1 defines the start of the range when the direction is "+", while col2 defines the start of the range when the direction is "-".
library(tidyverse)
df <- tibble(col1=c(1,10,100,40,1000), col2=c(15,20,50,80,2000),
direction=c("+","+","-","+","+"), score=c(50,100,300,10,300))
df
#> # A tibble: 5 × 4
#> col1 col2 direction score
#> <dbl> <dbl> <chr> <dbl>
#> 1 1 15 + 50
#> 2 10 20 + 100
#> 3 100 50 - 300
#> 4 40 80 + 10
#> 5 1000 2000 + 300
Created on 2022-07-28 by the reprex package (v2.0.1)
Taking the direction into account, I want to extract, from the rows with overlapping ranges, the one with the highest score.
I want my data to look like this:
#> col1 col2 direction score
#> <dbl> <dbl> <chr> <dbl>
#> 1 10 20 + 100
#> 3 100 50 - 300
#> 5 1000 2000 + 300
Any ideas and help are highly appreciated.
We could use slice_max() after grouping by rleid() on 'direction'.
library(dplyr)
library(data.table)
df %>%
  group_by(grp = rleid(direction)) %>%
  slice_max(n = 1, order_by = score) %>%
  ungroup() %>%
  select(-grp)
Output:
# A tibble: 3 × 4
col1 col2 direction score
<dbl> <dbl> <chr> <dbl>
1 10 20 + 100
2 100 50 - 300
3 1000 2000 + 300
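If you'd rather avoid the data.table dependency, the same run-length grouping can be built with cumsum() and lag(). A minimal dplyr-only sketch, assuming consecutive rows with the same direction should form one group, exactly as rleid() defines it:

library(dplyr)

df %>%
  # start a new group whenever the direction changes from the previous row
  group_by(grp = cumsum(direction != lag(direction, default = first(direction)))) %>%
  slice_max(score, n = 1) %>%
  ungroup() %>%
  select(-grp)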
I am using the dplyr package. Let's suppose I have the below table.
Group  count
A      20
A      10
B      30
B      35
C      50
C      60
My goal is to create a summary table that contains the mean for each group, along with each group's mean as a percentage of the sum of all group means. So the final table will look like this:
Group  avg   prcnt_of_total
A      15    .14
B      32.5  .31
C      55    .53
For example, 0.14 is the result of the following calculation: 15/(15+32.5+55)
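A quick sanity check of that arithmetic in R:

15 / (15 + 32.5 + 55)
#> [1] 0.1463415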
Right now, I have only been able to produce the code for the first column, which calculates the mean for each group:
summary_df <- df %>%
  group_by(Group) %>%
  summarise(avg = mean(count))
I still don't know how to produce the prcnt_of_total column. Any suggestions?
You can use the following code:
df <- read.table(text="Group count
A 20
A 10
B 30
B 35
C 50
C 60", header = TRUE)
library(dplyr)
df %>%
  group_by(Group) %>%
  summarise(avg = mean(count)) %>%
  ungroup() %>%
  mutate(prcnt_of_total = prop.table(avg))
#> # A tibble: 3 × 3
#> Group avg prcnt_of_total
#> <chr> <dbl> <dbl>
#> 1 A 15 0.146
#> 2 B 32.5 0.317
#> 3 C 55 0.537
Created on 2022-07-14 by the reprex package (v2.0.1)
We can drop the grouping in summarise() itself.
library(dplyr)
df1 %>%
  group_by(Group) %>%
  summarise(avg = mean(count), .groups = "drop") %>%
  mutate(prcnt_of_total = avg / sum(avg))
#> # A tibble: 3 x 3
#> Group avg prcnt_of_total
#> <chr> <dbl> <dbl>
#> 1 A 15 0.146
#> 2 B 32.5 0.317
#> 3 C 55 0.537
On another note, I am not sure that the average divided by the sum of averages is a meaningful metric unless every group has the same number of entries. Given that, I have suggested another solution as well.
## if you always have the same number of rows between the groups
df1 %>%
  group_by(Group) %>%
  summarise(avg = mean(count),
            prcnt_of_total = sum(count) / sum(.$count))
#> # A tibble: 3 x 3
#> Group avg prcnt_of_total
#> <chr> <dbl> <dbl>
#> 1 A 15 0.146
#> 2 B 32.5 0.317
#> 3 C 55 0.537
Data:
read.table(text = "Group count
A 20
A 10
B 30
B 35
C 50
C 60",
header = T, stringsAsFactors = F) -> df1
You can do this:
df %>%
  group_by(Group) %>%
  summarize(avg = mean(count), prcent_of_total = sum(count) / sum(df$count))
Output:
Group avg prcent_of_total
<chr> <dbl> <dbl>
1 A 15 0.146
2 B 32.5 0.317
3 C 55 0.537
data.table is similar:
library(data.table)
setDT(df)[, .(avg = mean(count), prcent_of_total = sum(count) / sum(df$count)), by = Group]
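For completeness, a base-R sketch of the same summary using aggregate() and proportions(), assuming the same df as above:

agg <- aggregate(count ~ Group, df, mean)
names(agg)[2] <- "avg"
agg$prcnt_of_total <- proportions(agg$avg)  # identical to avg / sum(avg)
agg
#>   Group  avg prcnt_of_total
#> 1     A 15.0      0.1463415
#> 2     B 32.5      0.3170732
#> 3     C 55.0      0.5365854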
The sample data frame:
no <- rep(1:5, each=2)
type <- rep(LETTERS[1:2], times=5)
set.seed(4)
value <- round(runif(10, 10, 30))
df <- data.frame(no, type, value)
df
no type value
1 1 A 22
2 1 B 10
3 2 A 16
4 2 B 16
5 3 A 26
6 3 B 15
7 4 A 24
8 4 B 28
9 5 A 29
10 5 B 11
Now what I want is to calculate the percentage value of each type (A or B) within each no and create separate columns. The desired output is something like this:
no pct_A pct_B total_value
1 1 68.75000 31.25000 32
2 2 50.00000 50.00000 32
3 3 63.41463 36.58537 41
4 4 46.15385 53.84615 52
5 5 72.50000 27.50000 40
What I have tried so far (this gives the right output, but the process seems very sub-optimal):
df %>%
  group_by(no) %>%
  mutate(total_value = sum(value)) -> df

df %>%
  mutate(pct_A = ifelse(type == 'A', (value / total_value) * 100, 0),
         pct_B = ifelse(type == 'B', (value / total_value) * 100, 0)) %>%
  group_by(no) %>%
  summarise(pct_A = sum(pct_A),
            pct_B = sum(pct_B)) %>%
  ungroup() %>%
  merge(df) %>%
  distinct(no, .keep_all = T) %>%
  select(-type, -value)
Is there any better way to do that? Especially using dplyr?
I looked for other answers too, but none helped. This one came closest:
R Create new column of values based on the factor levels of another column
You could do it in base R using aggregate().
do.call(data.frame, aggregate(value ~ no, df, \(x) c(proportions(x), sum(x)))) |>
  setNames(c('no', 'pct_A', 'pct_B', 'total_value'))
# no pct_A pct_B total_value
# 1 1 0.6875000 0.3125000 32
# 2 2 0.5000000 0.5000000 32
# 3 3 0.6341463 0.3658537 41
# 4 4 0.4615385 0.5384615 52
# 5 5 0.7250000 0.2750000 40
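Note that this returns proportions rather than the percentages shown in the desired output; if you want percentages, scaling inside the anonymous function is enough:

do.call(data.frame, aggregate(value ~ no, df, \(x) c(proportions(x) * 100, sum(x)))) |>
  setNames(c('no', 'pct_A', 'pct_B', 'total_value'))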
For each no we can calculate the sum and the ratio, then reshape the data to wide format.
library(dplyr)
library(tidyr)
df %>%
  group_by(no) %>%
  mutate(total_value = sum(value),
         value = prop.table(value) * 100) %>%
  ungroup() %>%
  pivot_wider(names_from = type, values_from = value, names_prefix = 'pct_')
# no total_value pct_A pct_B
# <int> <dbl> <dbl> <dbl>
#1 1 32 68.8 31.2
#2 2 32 50 50
#3 3 41 63.4 36.6
#4 4 52 46.2 53.8
#5 5 40 72.5 27.5
Here are two more ways to do this.
We could use purrr::map_dfc. However, setting up the correct column names is kind of cumbersome:
library(dplyr)
library(purrr)
df %>%
  group_by(no) %>%
  summarise(total_value = sum(value),
            map_dfc(unique(type) %>% set_names(., paste0("pct_", .)),
                    ~ sum((type == .x) * value) / total_value * 100)
  )
#> # A tibble: 5 x 4
#> no total_value pct_A pct_B
#> <int> <dbl> <dbl> <dbl>
#> 1 1 32 68.8 31.2
#> 2 2 32 50 50
#> 3 3 41 63.4 36.6
#> 4 4 52 46.2 53.8
#> 5 5 40 72.5 27.5
Alternatively, we can use dplyover::over (disclaimer: I'm the maintainer), which allows us to create names on the fly in an across-like way:
library(dplyover) # https://github.com/TimTeaFan/dplyover
df %>%
  group_by(no) %>%
  summarise(total_value = sum(value),
            over(dist_values(type), # alternatively `unique(type)`
                 ~ sum((type == .x) * value) / total_value * 100,
                 .names = "pct_{x}")
  )
#> # A tibble: 5 x 4
#> no total_value pct_A pct_B
#> <int> <dbl> <dbl> <dbl>
#> 1 1 32 68.8 31.2
#> 2 2 32 50 50
#> 3 3 41 63.4 36.6
#> 4 4 52 46.2 53.8
#> 5 5 40 72.5 27.5
Created on 2021-09-17 by the reprex package (v2.0.1)
Performance-wise, both approaches should be faster than data-rectangling approaches such as pivot_wider (but I haven't benchmarked this specific scenario).
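If you want to check that claim on your own data, here is a minimal benchmarking sketch with the bench package (an assumption on my part; any timing tool works), comparing the pivot_wider route with the over() route:

library(bench)

bench::mark(
  pivot_wider = df %>%
    group_by(no) %>%
    mutate(total_value = sum(value),
           value = prop.table(value) * 100) %>%
    ungroup() %>%
    pivot_wider(names_from = type, values_from = value, names_prefix = "pct_"),
  over = df %>%
    group_by(no) %>%
    summarise(total_value = sum(value),
              over(unique(type),
                   ~ sum((type == .x) * value) / total_value * 100,
                   .names = "pct_{x}")),
  check = FALSE  # the two results have a different column order
)

On a 10-row toy data set the timings mean little; run it on data of realistic size.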
I have a data frame with ~150K rows and 77 categorical variables in a form such as the one below. How do I find the mean Score and the count for each category?
One numeric variable and 77 grouping variables
students <- data.frame(ID = c("A", "B", "C", "D"), Gender = c("M", "F", "F", "F"),
                       Socioeconomic = c("Low", "Low", "Medium", "High"),
                       Subject = c("Maths", "Maths", "Science", "Science"),
                       Scores = c(45, 98, 50, 38))
That is, I do not want to go through each categorical column individually, 77 times; I want a single object that contains the outputs of each of the calls below:
students %>% group_by(Gender) %>% summarise(Mean.score = mean(Scores), Count = length(ID))
students %>% group_by(Socioeconomic) %>% summarise(Mean.score = mean(Scores), Count = length(ID))
students %>% group_by(Subject) %>% summarise(Mean.score = mean(Scores), Count = length(ID))
Here are two options:
library(tidyverse)
# map successively over each categorical column
map(students %>% select(-Scores, -ID) %>% names() %>% set_names(),
    ~ students %>%
        group_by_at(.x) %>%
        summarise(Mean.score = mean(Scores),
                  Count = n())
)
$Gender
# A tibble: 2 x 3
Gender Mean.score Count
<fct> <dbl> <int>
1 F 62 3
2 M 45 1
$Socioeconomic
# A tibble: 3 x 3
Socioeconomic Mean.score Count
<fct> <dbl> <int>
1 High 38 1
2 Low 71.5 2
3 Medium 50 1
$Subject
# A tibble: 2 x 3
Subject Mean.score Count
<fct> <dbl> <int>
1 Maths 71.5 2
2 Science 44 2
# Convert to long format, group, then summarize
students %>%
  gather(key, value, -ID, -Scores) %>%
  group_by(key, value) %>%
  summarise(Count = n(),
            Mean.score = mean(Scores))
key value Count Mean.score
<chr> <chr> <int> <dbl>
1 Gender F 3 62
2 Gender M 1 45
3 Socioeconomic High 1 38
4 Socioeconomic Low 2 71.5
5 Socioeconomic Medium 1 50
6 Subject Maths 2 71.5
7 Subject Science 2 44
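Since gather() is superseded in current tidyr, here is a sketch of the same long-format approach using pivot_longer() (assuming tidyr >= 1.0):

library(tidyr)

students %>%
  pivot_longer(-c(ID, Scores), names_to = "key", values_to = "value") %>%
  group_by(key, value) %>%
  summarise(Count = n(),
            Mean.score = mean(Scores),
            .groups = "drop")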
I have the following data with ID and value:
id <- c("1103-5","1103-5","1104-2","1104-2","1104-4","1104-4","1106-2","1106-2","1106-3","1106-3","2294-1","2294-1","2294-2","2294-2","2294-2","2294-3","2294-3","2294-3","2294-4","2294-4","2294-5","2294-5","2294-5","2300-1","2300-1","2300-2","2300-2","2300-4","2300-4","2321-1","2321-1","2321-2","2321-2","2321-3","2321-3","2321-4","2321-4","2347-1","2347-1","2347-2","2347-2")
value <- c(6,3,6,3,6,3,6,3,6,3,3,6,9,3,6,9,3,6,3,6,9,3,6,9,6,9,6,9,6,9,3,9,3,9,3,9,3,9,6,9,6)
If you notice, there are multiple values for the same id. What I'd like to do is keep the values 3 and 6, but only for IDs that have both. For example, ID "1103-5" has both 3 and 6, so it should be in the list, but "2347-2" should not.
I'm using R.
One method I tried is the following, but it gives me every row whose value is 3 or 6.
d <- data.frame(id, value)
group36 <- d[d$value == 3 | d$value == 6,]
and
d %>% group_by(id) %>% filter(3 == value | 6 == value)
The output should be like this:
id value
1103-5 6
1103-5 3
1104-2 6
1104-2 3
1104-4 6
1104-4 3
1106-2 6
1106-2 3
1106-3 6
1106-3 3
2294-1 3
2294-1 6
2294-2 3
2294-2 6
2294-3 3
2294-3 6
2294-4 3
2294-4 6
2294-5 3
2294-5 6
d <- group_by(d, id)
filter(d, any(value == 3), any(value == 6))
This gives you all the IDs where there is both a value of 3 (somewhere) AND a value of 6 (somewhere). Mind you, your data contains some IDs with THREE values; in those cases, if both 3 and 6 are present, the ID is still included in the result.
If you also want to exclude the remaining rows whose value doesn't equal 3 or 6, add this:
filter(d, value == 3 | value == 6)
If you want to do both at once (keep only the IDs that have both 3 and 6, and within those keep only the rows whose value is 3 or 6), use this:
filter(d, any(value == 3), any(value == 6), value == 3 | value == 6)
Not sure if this is what you want. We can filter the rows whose value equals either 3 or 6, then convert from long to wide format and keep only the columns that contain both 3 and 6. After that, convert back to long format.
library(dplyr)
library(tidyr)
id <- c("1103-5","1103-5","1104-2","1104-2","1104-4","1104-4","1106-2","1106-2",
"1106-3","1106-3","2294-1","2294-1","2294-2","2294-2","2294-2",
"2294-3","2294-3","2294-3","2294-4","2294-4","2294-5","2294-5","2294-5",
"2300-1","2300-1","2300-2","2300-2","2300-4","2300-4","2321-1","2321-1",
"2321-2","2321-2","2321-3","2321-3","2321-4","2321-4","2347-1","2347-1","2347-2","2347-2")
value <- c(6,3,6,3,6,3,6,3,6,3,3,6,9,3,6,9,3,6,3,6,9,3,6,9,6,9,6,9,6,9,3,9,3,9,3,9,3,9,6,9,6)
d <- data.frame(id, value)
d %>%
  group_by(id) %>%
  filter(value %in% c(3, 6)) %>%
  mutate(rows = 1:n()) %>%
  spread(key = id, value) %>%
  select_if(~ all(!is.na(.)))
#> # A tibble: 2 x 11
#> rows `1103-5` `1104-2` `1104-4` `1106-2` `1106-3` `2294-1` `2294-2`
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 6 6 6 6 6 3 3
#> 2 2 3 3 3 3 3 6 6
#> # ... with 3 more variables: `2294-3` <dbl>, `2294-4` <dbl>,
#> # `2294-5` <dbl>
d %>%
  group_by(id) %>%
  filter(value %in% c(3, 6)) %>%
  mutate(rows = 1:n()) %>%
  spread(key = id, value) %>%
  select_if(~ all(!is.na(.))) %>%
  select(-rows) %>%
  gather(id, value)
#> # A tibble: 20 x 2
#> id value
#> <chr> <dbl>
#> 1 1103-5 6
#> 2 1103-5 3
#> 3 1104-2 6
#> 4 1104-2 3
#> 5 1104-4 6
#> 6 1104-4 3
#> 7 1106-2 6
#> 8 1106-2 3
#> 9 1106-3 6
#> 10 1106-3 3
#> 11 2294-1 3
#> 12 2294-1 6
#> 13 2294-2 3
#> 14 2294-2 6
#> 15 2294-3 3
#> 16 2294-3 6
#> 17 2294-4 3
#> 18 2294-4 6
#> 19 2294-5 3
#> 20 2294-5 6
Created on 2018-07-01 by the reprex package (v0.2.0.9000).
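For comparison, a more direct sketch that avoids reshaping altogether: drop everything except the 3/6 rows, then keep only the IDs that still contain both values (this mirrors the any()-based filter in the earlier answer):

library(dplyr)

d %>%
  filter(value %in% c(3, 6)) %>%        # keep only rows with value 3 or 6
  group_by(id) %>%
  filter(all(c(3, 6) %in% value)) %>%   # keep ids that have both 3 and 6
  ungroup()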