Calculating proportions based on conditions - r

Please see the sample data below:
data <- data.frame(q1 = c(3, 4, 5, 2, 1, 2, 4),
                   q2 = c(3, 4, 4, 5, 4, 3, 2),
                   q3 = c(2, 3, 2, 3, 1, 2, 3),
                   q4 = c(3, 4, 4, 4, 4, 5, 5))
I would like to create another column which shows the percent of 4/5 responses. The output I am hoping to get looks something like this. Any help is appreciated, thank you!
q1 q2 q3 q4 percent
1 3 3 2 3 0.00
2 4 4 3 4 0.75
3 5 4 2 4 0.75
4 2 5 3 4 0.50
5 1 4 1 4 0.50
6 2 3 2 5 0.25
7 4 2 3 5 0.50

Using rowMeans:
library(dplyr)
data %>%
  mutate(percent = rowMeans(across(everything(), ~ .x %in% 4:5)))
Output:
q1 q2 q3 q4 percent
1 3 3 2 3 0.00
2 4 4 3 4 0.75
3 5 4 2 4 0.75
4 2 5 3 4 0.50
5 1 4 1 4 0.50
6 2 3 2 5 0.25
7 4 2 3 5 0.50

One possible solution (since the responses are whole numbers 1-5, values greater than 3 are exactly the 4s and 5s):
data$percent = rowMeans(data > 3)
Or
data$percent = apply(data, 1, \(x) mean(x %in% 4:5))
q1 q2 q3 q4 percent
1 3 3 2 3 0.00
2 4 4 3 4 0.75
3 5 4 2 4 0.75
4 2 5 3 4 0.50
5 1 4 1 4 0.50
6 2 3 2 5 0.25
7 4 2 3 5 0.50

library(dplyr)
data <- data.frame(q1 = c(3, 4, 5, 2, 1, 2, 4),
                   q2 = c(3, 4, 4, 5, 4, 3, 2),
                   q3 = c(2, 3, 2, 3, 1, 2, 3),
                   q4 = c(3, 4, 4, 4, 4, 5, 5))
percent_4_5 <- function(x) {
  (sum(x == 4) + sum(x == 5)) / length(x)
}
data %>%
  rowwise() %>%
  mutate(percent = percent_4_5(c_across(starts_with("q")))) %>%
  ungroup()
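Worth noting: percent_4_5(x) is equivalent to mean(x %in% 4:5), and rowwise() recomputes the function for every row, so the vectorized rowMeans() solutions above will be noticeably faster on large data.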

Another possible solution, without using dplyr:
library(magrittr)
data$percent <- (data > 3) %>% as.data.frame() %>% apply(., 1, mean)
TRUE is treated as 1 and FALSE as 0 when the mean is taken.
Output:
q1 q2 q3 q4 percent
1 3 3 2 3 0.00
2 4 4 3 4 0.75
3 5 4 2 4 0.75
4 2 5 3 4 0.50
5 1 4 1 4 0.50
6 2 3 2 5 0.25
7 4 2 3 5 0.50
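As a quick illustration of that coercion, which all of the rowMeans()/apply() answers above rely on:
x <- c(3, 4, 5, 2)
x %in% 4:5        # FALSE TRUE TRUE FALSE
mean(x %in% 4:5)  # 0.5 -- TRUE counts as 1, FALSE as 0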

Related

Find the value that occurs most frequently and indicate relative frequency

I observe 12 responses from each of 2 survey participants.
data = data.frame(id = c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2),
                  response = c(2,2,3,3,6,3,6,7,3,1,4,3,3,3,6,4,2,6,7,3,2,1,5,6))
data
id response
1 1 2
2 1 2
3 1 3
4 1 3
5 1 6
6 1 3
7 1 6
8 1 7
9 1 3
10 1 1
11 1 4
12 1 3
13 2 3
14 2 3
15 2 6
16 2 4
17 2 2
18 2 6
19 2 7
20 2 3
21 2 2
22 2 1
23 2 5
24 2 6
Now I want to add 2 things to the data for each survey participant:
a) the most frequent value for this survey participant
b) the relative frequency of the most frequent value
How can I add these using dplyr?
data %>%
  group_by(id) %>%
  mutate(most_frequent_value = ?,
         relative_frequency_of_most_frequent_value = ?)
I'd probably use a two-step solution. First, create a data frame of frequencies and relative frequencies; then join to it. We use slice(which.max()) because it always returns exactly one row, whereas slice_max() may return multiple rows when there are ties.
library(tidyverse)
# count by id, response, calculate rel frequency
# rename columns to make inner_join easier
freq_table <- data %>%
  count(id, response) %>%
  group_by(id) %>%
  mutate(rel_freq = n / sum(n)) %>%
  select(id, most_frequent_response = response, rel_freq)
# inner join to the sliced freq_table (grouping by id is preserved)
data %>%
  inner_join(freq_table %>% slice(which.max(rel_freq)))
# id response most_frequent_response rel_freq
# 1 1 2 3 0.4166667
# 2 1 2 3 0.4166667
# 3 1 3 3 0.4166667
# 4 1 3 3 0.4166667
# 5 1 6 3 0.4166667
# ...
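As a small illustration of the slice(which.max()) vs slice_max() distinction, using a throwaway tibble with a tie:
tb <- tibble(x = c(5, 5, 3))
which.max(tb$x)                                 # 1 -- a single index, the first maximum only
tb %>% slice_max(x)                             # two rows -- ties are kept by default
tb %>% slice_max(x, n = 1, with_ties = FALSE)   # one row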
You could try:
table(data$id, data$response) %>%
  as.data.frame() %>%
  setNames(c("id", "response", "n")) %>%
  group_by(id) %>%
  slice_max(n, n = 1) %>%
  group_by(response) %>%
  filter(n() > 1) %>%
  mutate(ratio = n / sum(n))
#> # A tibble: 2 x 4
#> # Groups: response [1]
#> id response n ratio
#> <fct> <fct> <int> <dbl>
#> 1 1 3 5 0.625
#> 2 2 3 3 0.375
Does this work?
data %>%
  group_by(id, response) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  group_by(id) %>%
  mutate(most_frequent_value = response[n == max(n)][1],
         relative_frequency_of_most_frequent_value = max(n) / n())
# A tibble: 24 x 5
# Groups: id [2]
id response n most_frequent_value relative_frequency_of_most_frequent_value
<dbl> <dbl> <int> <dbl> <dbl>
1 1 2 2 3 0.417
2 1 2 2 3 0.417
3 1 3 5 3 0.417
4 1 3 5 3 0.417
5 1 6 2 3 0.417
6 1 3 5 3 0.417
7 1 6 2 3 0.417
8 1 7 1 3 0.417
9 1 3 5 3 0.417
10 1 1 1 3 0.417
11 1 4 1 3 0.417
12 1 3 5 3 0.417
13 2 3 3 3 0.25
14 2 3 3 3 0.25
15 2 6 3 3 0.25
16 2 4 1 3 0.25
17 2 2 2 3 0.25
18 2 6 3 3 0.25
19 2 7 1 3 0.25
20 2 3 3 3 0.25
21 2 2 2 3 0.25
22 2 1 1 3 0.25
23 2 5 1 3 0.25
24 2 6 3 3 0.25
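One caveat with this approach: when two responses tie for most frequent (as happens for id 2, where 3 and 6 each occur three times), response[n == max(n)][1] silently picks whichever comes first in row order.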

Group_by then Filter and compute in R

This is my data set. You can get the data from this link (if it doesn't work, please let me know):
https://www.dropbox.com/s/1n9hpyhcniaghh5/table.csv?dl=0
LABEL DATE TAU TYPE x y z
1 A 1 2 1 0.75 7 16
2 A 1 2 0 0.41 5 18
3 A 1 2 1 0.39 6 14
4 A 2 3 0 0.65 5 14
5 A 2 3 1 0.55 7 19
6 A 2 3 1 0.69 5 19
7 A 2 3 0 0.66 7 19
8 A 3 1 0 0.38 8 15
9 A 3 1 0 0.02 5 16
10 A 3 1 0 0.71 8 13
11 B 1 2 1 0.25 9 18
12 B 1 2 0 0.06 8 20
13 B 1 2 1 0.60 8 20
14 B 1 2 0 0.56 6 13
15 B 1 3 1 0.50 8 19
16 B 1 3 0 0.04 8 16
17 B 2 1 1 0.04 5 15
18 B 2 1 1 0.75 5 13
19 B 2 1 0 0.44 8 18
20 B 2 1 1 0.52 9 13
I want to filter the data by group, with multiple conditions. The conditions are:
the number of rows for each type (0, 1) of the TYPE variable in each group must be greater than 1
the number of rows for each type must be equal
(for example: the number of rows for type 1 equals the number of rows for type 0 within each group)
I tried many times, and finally got this code and this output:
table %>% group_by(label, date, tau, type) %>% filter(n() > 1) %>% filter(length(type == 1) == length(type == 0))
# A tibble: 16 x 7
# Groups: label, date, tau, type [7]
LABEL DATE TAU TYPE x y z
<fctr> <int> <int> <int> <dbl> <int> <int>
1 A 1 2 1 0.75 7 16
2 A 1 2 1 0.39 6 14
3 A 2 3 0 0.65 5 14
4 A 2 3 1 0.55 7 19
5 A 2 3 1 0.69 5 19
6 A 2 3 0 0.66 7 19
7 A 3 1 0 0.38 8 15
8 A 3 1 0 0.02 5 16
9 A 3 1 0 0.71 8 13
10 B 1 2 1 0.25 9 18
11 B 1 2 0 0.06 8 20
12 B 1 2 1 0.60 8 20
13 B 1 2 0 0.56 6 13
14 B 2 1 1 0.04 5 15
15 B 2 1 1 0.75 5 13
16 B 2 1 1 0.52 9 13
I was confused by the output I get with this code. The data that didn't meet condition 1 is gone, BUT the data that didn't meet condition 2 is still inside.
The result I want looks like this:
LABEL DATE TAU TYPE x y z
<fctr> <int> <int> <int> <dbl> <int> <int>
3 A 2 3 0 0.65 5 14
4 A 2 3 1 0.55 7 19
5 A 2 3 1 0.69 5 19
6 A 2 3 0 0.66 7 19
10 B 1 2 1 0.25 9 18
11 B 1 2 0 0.06 8 20
12 B 1 2 1 0.60 8 20
13 B 1 2 0 0.56 6 13
And if I want to compute a value with the function below for each row, how can I code it? Just use mutate()?
f(x,y,z) = 2 * x + y - z / 3 if TYPE == 1
f(x,y,z) = 4 * x - y / 2 + z / 3 if TYPE == 0
I hope someone can help me, and I appreciate your help! If you need any other information, just let me know.
# example dataset
df = read.table(text = "
LABEL DATE TAU TYPE x y z
1 A 1 2 1 0.75 7 16
2 A 1 2 0 0.41 5 18
3 A 1 2 1 0.39 6 14
4 A 2 3 0 0.65 5 14
5 A 2 3 1 0.55 7 19
6 A 2 3 1 0.69 5 19
7 A 2 3 0 0.66 7 19
8 A 3 1 0 0.38 8 15
9 A 3 1 0 0.02 5 16
10 A 3 1 0 0.71 8 13
11 B 1 2 1 0.25 9 18
12 B 1 2 0 0.06 8 20
13 B 1 2 1 0.60 8 20
14 B 1 2 0 0.56 6 13
15 B 1 3 1 0.50 8 19
16 B 1 3 0 0.04 8 16
17 B 2 1 1 0.04 5 15
18 B 2 1 1 0.75 5 13
19 B 2 1 0 0.44 8 18
20 B 2 1 1 0.52 9 13
", header=T, stringsAsFactors=F)
library(dplyr)
library(tidyr)
# function to use for each row
# (assumes that type can be only 1 or 0)
f = function(t, x, y, z) {
  ifelse(t == 1,
         2 * x + y - z / 3,
         4 * x - y / 2 + z / 3)
}
df %>%
  count(LABEL, DATE, TAU, TYPE) %>%         # count rows for each group (based on those combinations)
  filter(n > 1) %>%                         # keep groups with multiple rows
  mutate(TYPE = paste0("TYPE_", TYPE)) %>%  # update variable
  spread(TYPE, n, fill = 0) %>%             # reshape data
  filter(TYPE_0 == TYPE_1) %>%              # keep groups with equal numbers of type 0 and type 1 rows
  select(LABEL, DATE, TAU) %>%              # keep variables/groups of interest
  inner_join(df, by = c("LABEL", "DATE", "TAU")) %>%  # join back info
  mutate(f_value = f(TYPE, x, y, z))        # apply function
# # A tibble: 8 x 8
# LABEL DATE TAU TYPE x y z f_value
# <chr> <int> <int> <int> <dbl> <int> <int> <dbl>
# 1 A 2 3 0 0.65 5 14 4.76666667
# 2 A 2 3 1 0.55 7 19 1.76666667
# 3 A 2 3 1 0.69 5 19 0.04666667
# 4 A 2 3 0 0.66 7 19 5.47333333
# 5 B 1 2 1 0.25 9 18 3.50000000
# 6 B 1 2 0 0.06 8 20 2.90666667
# 7 B 1 2 1 0.60 8 20 2.53333333
# 8 B 1 2 0 0.56 6 13 3.57333333
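Incidentally, the reason the original attempt's second filter did nothing is that length(type == 1) returns the length of the logical vector (the whole group size), not the number of TRUEs, so the two lengths are always equal. sum() counts the TRUEs instead, which allows a more direct filter; a sketch, reusing df and f from above:
df %>%
  group_by(LABEL, DATE, TAU) %>%
  filter(sum(TYPE == 1) == sum(TYPE == 0),   # equal counts of both types...
         sum(TYPE == 1) > 1) %>%             # ...and more than one row of each
  ungroup() %>%
  mutate(f_value = f(TYPE, x, y, z))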

Fill value backwards from occurrence by group

Problem: how can I fill backwards all rows in a group that come before an occurrence of a certain value? I am not trying to fill in NA or missing values with zoo::na.locf. In the following, within each ID group, I would like to fill all rows of A that precede the 1.00 with 1.00, ideally using dplyr.
Input:
data <- data.frame(ID = c(1,1,1,1,2,2,2,3,3,3,4,4,4,4,4),
                   time = c(1,2,3,4,1,2,3,1,2,3,1,2,3,4,5),
                   A = c(0.10,0.25,1,0,0.25,1,0.25,0,1,0.10,1,0.10,0.10,0.10,0.05))
ID time A
1 1 0.10
1 2 0.25
1 3 1.00
1 4 0.00
2 1 0.25
2 2 1.00
2 3 0.25
3 1 0.00
3 2 1.00
3 3 0.10
4 1 1.00
4 2 0.10
4 3 0.10
4 4 0.10
4 5 0.05
Desired output:
ID time A
1 1 1.00
1 2 1.00
1 3 1.00
1 4 0.00
2 1 1.00
2 2 1.00
2 3 0.25
3 1 1.00
3 2 1.00
3 3 0.10
4 1 1.00
4 2 0.10
4 3 0.10
4 4 0.10
4 5 0.05
After grouping by ID you can take the cumulative sum of A == 1 and, wherever it is still below 1 (i.e. the 1 has not yet appeared), replace the A value with 1:
data %>%
  group_by(ID) %>%
  mutate(A = replace(A, cumsum(A == 1) < 1, 1))
# Source: local data frame [15 x 3]
# Groups: ID [4]
#
# ID time A
# <dbl> <dbl> <dbl>
# 1 1 1 1.00
# 2 1 2 1.00
# 3 1 3 1.00
# 4 1 4 0.00
# 5 2 1 1.00
# 6 2 2 1.00
# 7 2 3 0.25
# 8 3 1 1.00
# 9 3 2 1.00
# 10 3 3 0.10
# 11 4 1 1.00
# 12 4 2 0.10
# 13 4 3 0.10
# 14 4 4 0.10
# 15 4 5 0.05
Quite similarly, you could also use cummax():
data %>% group_by(ID) %>% mutate(A = replace(A, !cummax(A == 1), 1))
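To see why this works: cummax() of the logical A == 1 stays at 0 until the first 1 appears and is 1 from then on, so its negation flags exactly the rows to overwrite:
A <- c(0.10, 0.25, 1, 0)
cummax(A == 1)    # 0 0 1 1
!cummax(A == 1)   # TRUE TRUE FALSE FALSE, i.e. the rows to replace with 1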
And here's a base R approach:
transform(data, A = ave(A, ID, FUN = function(x) replace(x, !cummax(x == 1), 1)))
We can use data.table. Convert the data.frame to a data.table with setDT(data), find the row where A is 1 within each ID, build the sequence of row indices up to that point, and use it as i to assign (:=) the value 1 to A. (This relies on A == 1 occurring exactly once per ID; with several 1s per group, which(A == 1) would need to be reduced to its first element.)
library(data.table)
setDT(data)[data[, .I[seq_len(which(A==1))], ID]$V1, A := 1][]
# ID time A
# 1: 1 1 1.00
# 2: 1 2 1.00
# 3: 1 3 1.00
# 4: 1 4 0.00
# 5: 2 1 1.00
# 6: 2 2 1.00
# 7: 2 3 0.25
# 8: 3 1 1.00
# 9: 3 2 1.00
#10: 3 3 0.10
#11: 4 1 1.00
#12: 4 2 0.10
#13: 4 3 0.10
#14: 4 4 0.10
#15: 4 5 0.05
Or we can use ave() from base R:
data$A[with(data, ave(A==1, ID, FUN = cumsum)<1)] <- 1

Calculate various means from data based on values

I have a large spreadsheet of team member ratings from which I want to calculate how people rated themselves, how they were rated by everyone else on their team, and how they rated everyone else on their team (all averages). I've been trying to do this with dplyr because I have used it before and I think group_by() will simplify these calculations. I haven't been able to figure it out, so I'm asking for help. I'll try to explain my thinking.
Here's an example dataset:
data <- read.table(text="
Team Rater A1 B1 C1 A2 B2 C2 A3 B3 C3 A4 B4 C4 A5 B5 C5 A6 B6 C6
1 1 2 4 4 2 1 5 2 2 3 4 4 4 3 2 1 NA NA NA
1 2 4 5 4 4 5 1 1 1 5 5 3 1 4 5 2 NA NA NA
1 3 2 1 4 3 5 5 2 1 5 1 1 4 1 1 4 NA NA NA
1 4 4 3 4 3 5 1 3 1 3 5 5 5 5 2 2 NA NA NA
1 5 3 4 5 4 3 3 5 5 4 1 4 5 5 5 1 NA NA NA
2 1 3 5 3 4 1 1 3 4 3 4 3 2 2 2 3 3 5 3
2 2 3 2 3 1 1 3 5 5 1 5 2 3 2 2 1 3 3 2
2 3 3 2 3 3 5 2 4 1 1 1 4 5 3 5 2 1 1 3
2 4 3 3 5 4 3 5 3 1 4 3 1 1 4 2 4 3 5 2
2 5 5 2 1 2 5 5 3 3 1 4 1 5 5 3 3 4 2 5
2 6 3 2 3 5 4 3 2 1 5 4 3 1 1 1 4 2 2 1",header = TRUE)
Each rater provides input on multiple questions for each other team member. The way it is organized, rater 1 answers A1, B1, and C1 about themselves. Rater 2 answers A2, B2, and C2 about themselves, and so on.
Self Ratings
To get someone's rating of themselves I figured it would be something like:
data %>%
  group_by(Team) %>%
  mutate(self = rowMeans(select(., ends_with(Rater)), na.rm = TRUE))
It'd be convenient if the column selection were dynamic, based on the rater's number.
From Others
I was thinking of calculating this based on the average overall rating of that person except the self rating:
data %>%
  group_by(Team) %>%
  mutate(from = (mean(ends_with(Rater)) * n() - self) / (n() - 1))
Of Others
For this column calculation I was thinking something along the lines of:
data %>%
  mutate(of = select(A1:C6, -(ends_with(Rater))) %>% rowMeans(na.rm = TRUE))
(similar to this answer)
Results
Here is an example of what I'm looking for as new columns:
Team Rater self from of
1 1 3.33 3.58 2.75
1 2 3.33 3.33 3.33
1 3 2.67 2.92 2.67
1 4 5.00 3.08 3.00
1 5 3.67 2.67 3.83
If you can help with any of these parts I'd appreciate it!
I would recommend first transforming your data into a "tidy" format with tidyr, like so:
library(tidyr)
tidy <- data %>%
  gather(QV, Rating, -Team, -Rater) %>%
  separate(QV, into = c("Quest", "Rated"), sep = 1) %>%
  mutate(Rated = as.numeric(Rated)) %>%
  filter(!is.na(Rating))
This transforms your data to have the following shape
Team Rater Quest Rated Rating
1 1 1 A 1 2
2 1 2 A 1 4
3 1 3 A 1 2
4 1 4 A 1 4
5 1 5 A 1 3
6 2 1 A 1 3
...
With the data in long format, you can perform each of the queries a bit more directly and then merge the results together:
Reduce(left_join, list(
  tidy %>% group_by(Team, Rater) %>% filter(Rated == Rater) %>% summarize(self = mean(Rating)),
  tidy %>% group_by(Team, Rated) %>% filter(Rated != Rater) %>% summarize(others = mean(Rating)) %>% rename(Rater = Rated),
  tidy %>% group_by(Team, Rater) %>% filter(Rated != Rater) %>% summarize(of = mean(Rating))
))
This returns
Team Rater self others of
(int) (dbl) (dbl) (dbl) (dbl)
1 1 1 3.333333 3.583333 2.750000
2 1 2 3.333333 3.333333 3.333333
3 1 3 2.666667 2.916667 2.666667
4 1 4 5.000000 3.083333 3.000000
5 1 5 3.666667 2.666667 3.833333
6 2 1 3.666667 2.866667 2.866667
7 2 2 1.666667 3.466667 2.800000
8 2 3 2.000000 2.933333 2.866667
9 2 4 1.666667 3.133333 3.400000
10 2 5 3.666667 2.533333 3.200000
11 2 6 1.666667 3.000000 2.800000
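If you also want the self rating computed directly on the wide data, with the column selection driven by the rater number as the question envisions, here is a rowwise sketch (assuming the question columns are named A1..C6 as above):
data %>%
  rowwise() %>%
  mutate(self = mean(c_across(all_of(paste0(c("A", "B", "C"), Rater))),
                     na.rm = TRUE)) %>%   # e.g. columns A1, B1, C1 for Rater 1
  ungroup()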

For each `pop` get frequencies of the elements of `id`

Consider this data:
m = data.frame(pop = c(1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4),
               id = c(0,1,1,1,1,1,0,2,1,1,1,2,1,2,2,2))
> m
pop id
1 1 0
2 1 1
3 1 1
4 1 1
5 2 1
6 2 1
7 2 0
8 2 2
9 2 1
10 3 1
11 3 1
12 3 2
13 3 1
14 3 2
15 4 2
16 4 2
I would like to get the frequency of each unique id in each unique pop. For example, id 1 is present 3 times out of 4 rows where pop == 1, so the frequency of id 1 in pop 1 is 0.75.
I came up with this ugly solution:
out = matrix(0, ncol = 3)
for (p in unique(m$pop)) {
  for (i in unique(m$id)) {
    m1 = m[m$pop == p, ]
    f = nrow(m1[m1$id == i, ]) / nrow(m1)
    out = rbind(out, c(p, f, i))
  }
}
out = out[-1, ]
colnames(out) = c("pop", "freq", "id")
# SOLUTION
> out
pop freq id
[1,] 1 0.25 0
[2,] 1 0.75 1
[3,] 1 0.00 2
[4,] 2 0.20 0
[5,] 2 0.60 1
[6,] 2 0.20 2
[7,] 3 0.00 0
[8,] 3 0.60 1
[9,] 3 0.40 2
[10,] 4 0.00 0
[11,] 4 0.00 1
[12,] 4 1.00 2
I am sure there is a more efficient solution using data.table or table(), but I couldn't find it.
Here's what I might do:
as.data.frame(prop.table(table(m),1))
# pop id Freq
# 1 1 0 0.25
# 2 2 0 0.20
# 3 3 0 0.00
# 4 4 0 0.00
# 5 1 1 0.75
# 6 2 1 0.60
# 7 3 1 0.60
# 8 4 1 0.00
# 9 1 2 0.00
# 10 2 2 0.20
# 11 3 2 0.40
# 12 4 2 1.00
If you want it sorted by pop, you can do that afterwards. Alternatively, you could transpose the table with t() before converting to a data.frame, or use rev(m) and prop.table on dimension 2.
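For example, to reproduce the pop-sorted layout of the loop output:
res <- as.data.frame(prop.table(table(m), 1))
res[order(res$pop, res$id), ]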
Try:
library(dplyr)
m %>%
  group_by(pop, id) %>%
  summarise(s = n()) %>%
  mutate(freq = s / sum(s)) %>%
  select(-s)
Which gives:
#Source: local data frame [8 x 3]
#Groups: pop
#
# pop id freq
#1 1 0 0.25
#2 1 1 0.75
#3 2 0 0.20
#4 2 1 0.60
#5 2 2 0.20
#6 3 1 0.60
#7 3 2 0.40
#8 4 2 1.00
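Note that this result has 8 rows rather than 12: (pop, id) combinations that never occur are simply absent. If you need the explicit zeros, as in the loop output, tidyr::complete() can fill them in; a sketch:
library(tidyr)
m %>%
  count(pop, id) %>%                             # n per (pop, id)
  group_by(pop) %>%
  mutate(freq = n / sum(n)) %>%
  ungroup() %>%
  complete(pop, id, fill = list(freq = 0)) %>%   # add the missing zero rows
  select(-n)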
A data.table solution:
setDT(m)[, {div = .N; .SD[, .N/div, keyby = id]}, by = pop]
# pop id V1
#1: 1 0 0.25
#2: 1 1 0.75
#3: 2 0 0.20
#4: 2 1 0.60
#5: 2 2 0.20
#6: 3 1 0.60
#7: 3 2 0.40
#8: 4 2 1.00
