Creating multiple proportion table in R dataframe - r

I've the following table
Result_Group
Review
A
1
B
4
A
1
C
1
D
5
D
4
E
5
C
1
C
2
A
2
B
3
E
2
df = structure(list(Result_Group = structure(c(1L, 2L, 1L, 3L, 4L, 4L, 5L, 3L, 3L, 1L, 2L, 5L), .Label = c("A", "B", "C", "D", "E"
), class = "factor"), Review = c(1L, 4L, 1L, 1L, 5L, 4L, 5L, 1L, 2L, 2L, 3L, 2L)),
class = "data.frame", row.names = c(NA, -12L))
Does anyone know how I can create a table of the proportion of each review score within each group? Currently I'm doing it group by group, and it's taking quite a while just to subset the data.
i.e. the table as follows:
Review
A
B
C
D
E
1
0.67
0
0.67
0
0
2
0.33
0
0.33
0
0.50
3
0
0.50
0
0
0
4
0
0.50
0
0.5
0
5
0
0
0
0.5
0.50
Thanks!

You can do:
library(tidyverse)
df |>
group_by(Result_Group) |>
count(Review) |>
mutate(prop = n/sum(n)) |>
ungroup() |>
select(-n) |>
pivot_wider(names_from = Result_Group,
values_from = prop,
values_fill = 0)
# A tibble: 5 x 6
Review A B C D E
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0.667 0 0.667 0 0
2 2 0.333 0 0.333 0 0.5
3 3 0 0.5 0 0 0
4 4 0 0.5 0 0.5 0
5 5 0 0 0 0.5 0.5

Here is a tidy approach using dplyr and tidyr
library(dplyr)
df %>%
# Add count values (all equal to 1)
mutate(count = 1) %>%
# Pivot wider to get A, B, C.. as column names, and sum of count as values
tidyr::pivot_wider(
id_cols = Review,
names_from = Result_Group,
values_from = count,
values_fn = sum,
values_fill = 0 # NAs are turned into 0
) %>%
# Mutate to get fractions instead of count
mutate(
across(
-Review,
~ .x / sum(.x)
)
) %>%
# Sort by review
arrange(Review)
#> # A tibble: 5 × 6
#> Review A B C D E
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.667 0 0.667 0 0
#> 2 2 0.333 0 0.333 0 0.5
#> 3 3 0 0.5 0 0 0
#> 4 4 0 0.5 0 0.5 0
#> 5 5 0 0 0 0.5 0.5
Created on 2022-03-22 by the reprex package (v2.0.1)

Related

R: Recursive Averages

I am working with the R programming language. I have the following data:
library(dplyr)
my_data = data.frame(id = c(1,1,1,1,2,2,2,3,4,4,5,5,5,5,5), var_1 = sample(c(0,1), 15, replace = TRUE) , var_2 =sample(c(0,1), 15 , replace = TRUE) )
my_data = data.frame(my_data %>% group_by(id) %>% mutate(index = row_number(id)))
my_data = my_data[,c(1,4,2,3)]
The data looks something like this:
id index var_1 var_2
1 1 1 0 1
2 1 2 0 0
3 1 3 1 1
4 1 4 0 1
5 2 1 1 0
6 2 2 1 1
7 2 3 0 1
8 3 1 1 0
9 4 1 0 0
10 4 2 0 0
11 5 1 0 0
12 5 2 1 0
13 5 3 0 1
14 5 4 0 0
15 5 5 0 1
I want to create two new variables (v_1, v_2). For each unique "id":
v_1: I want v_1 to be the average value of the current, previous and previous-to-previous values of var_1 (i.e. index = n, index = n-1 and index = n-2). When this is not possible (e.g. for index = 2 and index = 1), I want this average to be for as back as you can go.
v_2: I want v_2 to be the average value of the current, previous and previous-to-previous values of var_2 (i.e. index = n, index = n-1 and index = n-2). When this is not possible (e.g. for index = 2 and index = 1), I want this average to be for as back as you can go.
This would be something like this:
row 1 (id = 1, index = 1) : v_1 = var_1 (index 1)
row 2 (id = 1, index = 1 ; id = 1 index = 2) : v_1 = (var_1 (index 1) + var_1 (index 2))/2
row 3 (id = 1, index = 1 ; id = 1 index = 2; id = 1, index = 3) : v_1 = (var_1 (index 1) + var_1 (index 2) + var_1 (index 3)) /3
row 4 (id = 1, index = 2 ; id = 1 index = 3; id = 1, index = 4) : v_1 = (var_1 (index 2) + var_1 (index 3) + var_1 (index 4)) /3
etc.
I tried to do this with the following code:
average_data = my_data %>%
group_by(id) %>%
summarise(v_1 = mean(tail(var_1, 3)),
v_2 = mean(tail(var_2, 3)))
# final_result
final_data = merge(x = my_data, y = average_data, by = "id", all.x = TRUE)
But I am not sure if this is correct.
Can someone please show me how to do this?
Thanks!
data
df <- data.frame(
id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 4L, 4L, 5L, 5L, 5L, 5L, 5L),
index = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 1L, 1L, 2L, 1L, 2L, 3L, 4L, 5L),
var_1 = c(0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L),
var_2 = c(1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L)
)
tidyverse
library(tidyverse)
df %>%
group_by(id) %>%
mutate(across(starts_with("var_"),
.fns = ~zoo::rollapply(data = .x, width = 3, FUN = mean, partial = TRUE, align = "right"),
.names = "new_{.col}")) %>%
ungroup()
#> # A tibble: 15 × 6
#> id index var_1 var_2 new_var_1 new_var_2
#> <int> <int> <int> <int> <dbl> <dbl>
#> 1 1 1 0 1 0 1
#> 2 1 2 0 0 0 0.5
#> 3 1 3 1 1 0.333 0.667
#> 4 1 4 0 1 0.333 0.667
#> 5 2 1 1 0 1 0
#> 6 2 2 1 1 1 0.5
#> 7 2 3 0 1 0.667 0.667
#> 8 3 1 1 0 1 0
#> 9 4 1 0 0 0 0
#> 10 4 2 0 0 0 0
#> 11 5 1 0 0 0 0
#> 12 5 2 1 0 0.5 0
#> 13 5 3 0 1 0.333 0.333
#> 14 5 4 0 0 0.333 0.333
#> 15 5 5 0 1 0 0.667
Created on 2022-06-06 by the reprex package (v2.0.1)
data.table
library(data.table)
COLS <- gsub("ar", "", grep("var_", names(df), value = TRUE))
setDT(df)[,
(COLS) := lapply(.SD, function(x) zoo::rollapply(data = x, width = 3, FUN = mean, partial = TRUE, align = "right")),
by = id,
.SDcols = patterns("var_")][]
#> id index var_1 var_2 v_1 v_2
#> 1: 1 1 0 1 0.0000000 1.0000000
#> 2: 1 2 0 0 0.0000000 0.5000000
#> 3: 1 3 1 1 0.3333333 0.6666667
#> 4: 1 4 0 1 0.3333333 0.6666667
#> 5: 2 1 1 0 1.0000000 0.0000000
#> 6: 2 2 1 1 1.0000000 0.5000000
#> 7: 2 3 0 1 0.6666667 0.6666667
#> 8: 3 1 1 0 1.0000000 0.0000000
#> 9: 4 1 0 0 0.0000000 0.0000000
#> 10: 4 2 0 0 0.0000000 0.0000000
#> 11: 5 1 0 0 0.0000000 0.0000000
#> 12: 5 2 1 0 0.5000000 0.0000000
#> 13: 5 3 0 1 0.3333333 0.3333333
#> 14: 5 4 0 0 0.3333333 0.3333333
#> 15: 5 5 0 1 0.0000000 0.6666667
Created on 2022-06-06 by the reprex package (v2.0.1)
I would say this is a moving average, and it can be implemented by a function f like the ones below, using embed (preferable) or sapply (less efficient, not recommended), and run group-wise using ave:
# Trailing moving average of `v` with a window of up to `n` values.
# The input is left-padded with n - 1 NAs so that every position has a full
# row in the lag matrix; the padding is then ignored when averaging, which
# yields partial-window means for the first n - 1 positions.
f <- function(v, n = 3) {
  padded <- c(rep(NA, n - 1), v)
  windows <- embed(padded, n)
  rowMeans(windows, na.rm = TRUE)
}
or
# Trailing moving average of `v` with a window of up to `n` values.
# For position k the window is v[max(k - n + 1, 1):k], so the first n - 1
# positions average over as many values as are available.
# Returns a double vector the same length as `v` (numeric(0) for empty input).
f <- function(v, n = 3) {
  vapply(
    seq_along(v),
    # k is a scalar here, so plain max()/min() are enough
    function(k) sum(v[max(k - n + 1, 1):k]) / min(k, n),
    numeric(1)
  )
}
and then we run
transform(
df,
v1 = ave(var_1, id, FUN = f),
v2 = ave(var_2, id, FUN = f)
)
such that
id index var_1 var_2 v1 v2
1 1 1 0 1 0.0000000 1.0000000
2 1 2 0 0 0.0000000 0.5000000
3 1 3 1 1 0.3333333 0.6666667
4 1 4 0 1 0.3333333 0.6666667
5 2 1 1 0 1.0000000 0.0000000
6 2 2 1 1 1.0000000 0.5000000
7 2 3 0 1 0.6666667 0.6666667
8 3 1 1 0 1.0000000 0.0000000
9 4 1 0 0 0.0000000 0.0000000
10 4 2 0 0 0.0000000 0.0000000
11 5 1 0 0 0.0000000 0.0000000
12 5 2 1 0 0.5000000 0.0000000
13 5 3 0 1 0.3333333 0.3333333
14 5 4 0 0 0.3333333 0.3333333
15 5 5 0 1 0.0000000 0.6666667
You could create a function that accomplishes this:
library(tidyverse)
# Trailing moving average of `x` with window `k`, with partial windows at the
# start: the first k - 1 results are cumulative means, the rest are full
# k-point rolling means.
fun <- function(x, k){
  # head() takes the first k - 1 elements. dplyr::first(x, k - 1) does not:
  # its second argument is `order_by`, so it only behaved like head() when
  # data.table::first happened to be masking it.
  y <- cummean(head(x, k - 1))
  # With fewer than k values there is no full window to append.
  if (k > length(x)) y else c(y, zoo::rollmean(x, k))
}
df %>%
group_by(id) %>%
mutate(v_1 = fun(var_1, 3), v_2 = fun(var_2, 3))
# Groups: id [5]
id index var_1 var_2 v_1 v_2
<int> <int> <int> <int> <dbl> <dbl>
1 1 1 0 1 0 1
2 1 2 0 0 0 0.5
3 1 3 1 1 0.333 0.667
4 1 4 0 1 0.333 0.667
5 2 1 1 0 1 0
6 2 2 1 1 1 0.5
7 2 3 0 1 0.667 0.667
8 3 1 1 0 1 0
9 4 1 0 0 0 0
10 4 2 0 0 0 0
11 5 1 0 0 0 0
12 5 2 1 0 0.5 0
13 5 3 0 1 0.333 0.333
14 5 4 0 0 0.333 0.333
15 5 5 0 1 0 0.667
Here is a solution using only built in functions and dplyr:
my_data %>%
mutate(
row = seq_along(id),
v_1 = (var_1 + lag(var_1, default = 0) + lag(var_1, 2, default = 0))/pmin(row, 3),
v_2 = (var_2 + lag(var_2, default = 0) + lag(var_2, 2, default = 0))/pmin(row, 3)
)
#> id index var_1 var_2 row v_1 v_2
#> 1 1 1 0 1 1 0.0000000 1.0000000
#> 2 1 2 1 0 2 0.5000000 0.5000000
#> 3 1 3 1 1 3 0.6666667 0.6666667
#> 4 1 4 1 0 4 1.0000000 0.3333333
#> 5 2 1 0 1 5 0.6666667 0.6666667
#> 6 2 2 0 1 6 0.3333333 0.6666667
#> 7 2 3 1 1 7 0.3333333 1.0000000
#> 8 3 1 1 1 8 0.6666667 1.0000000
#> 9 4 1 1 1 9 1.0000000 1.0000000
#> 10 4 2 1 1 10 1.0000000 1.0000000
#> 11 5 1 0 1 11 0.6666667 1.0000000
#> 12 5 2 0 1 12 0.3333333 1.0000000
#> 13 5 3 0 0 13 0.0000000 0.6666667
#> 14 5 4 0 0 14 0.0000000 0.3333333
#> 15 5 5 1 0 15 0.3333333 0.0000000
Created on 2022-06-09 by the reprex package (v2.0.1)
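dplyr::lag() gives you the previous values of your variable. If they don't exist, we swap them for 0, basically ignoring them. To get the average we divide by pmin(seq_along(<any variable>), 3), which will be 1 for the first row, 2 for the second, and 3 for all other rows.
This will also work on a grouped dataframe. To get the per-id averages asked for in the question, add group_by(id) before mutate() so that row and lag() restart within each id; the output above was computed on the ungrouped data, so its windows run across id boundaries.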
You can use filter (which is hidden when loading dplyr) or convolve and ave for grouping.
# Trailing 3-point moving average of `x` with partial windows at the start:
# positions 1 and 2 are cumulative means, positions 3+ are full 3-point means.
# Returns a vector the same length as `x` (numeric(0) for empty input).
fun <- function(x) {
  # One-sided filter: element i is the mean of x[i-2], x[i-1], x[i]. The first
  # two results are NA (incomplete window) and are dropped.
  full <- if (length(x) > 2) {
    stats::filter(x, c(1, 1, 1) / 3, sides = 1)[-2:-1]
  } else {
    NULL
  }
  #full <- if(length(x) > 2) convolve(x, c(1,1,1)/3, , type = "filter") else NULL #Alternative
  # head() copes with length 0 and 1, where x[1:min(2, length(x))] would
  # index with 1:0 on an empty vector and produce an NA.
  start <- head(x, 2)
  c(cumsum(start) / seq_along(start), full)
}
my_data$v_1 <- ave(my_data$var_1, my_data$id, FUN=fun)
my_data$v_2 <- ave(my_data$var_2, my_data$id, FUN=fun)
my_data
# id index var_1 var_2 v_1 v_2
#1 1 1 1 1 1.0000000 1.0000000
#2 1 2 1 1 1.0000000 1.0000000
#3 1 3 0 1 0.6666667 1.0000000
#4 1 4 1 1 0.6666667 1.0000000
#5 2 1 0 1 0.0000000 1.0000000
#6 2 2 0 0 0.0000000 0.5000000
#7 2 3 1 0 0.3333333 0.3333333
#8 3 1 0 0 0.0000000 0.0000000
#9 4 1 0 1 0.0000000 1.0000000
#10 4 2 0 0 0.0000000 0.5000000
#11 5 1 1 0 1.0000000 0.0000000
#12 5 2 0 1 0.5000000 0.5000000
#13 5 3 0 0 0.3333333 0.3333333
#14 5 4 1 0 0.3333333 0.3333333
#15 5 5 0 1 0.3333333 0.3333333
Or using cumsum:
# Trailing moving average of `x` over up to `n` values, via cumulative sums:
# the window sum at position i is cumsum(x)[i] minus the cumulative sum n
# positions earlier (0 when there is no such position).
fun2 <- function(x, n = 3) {
  running <- cumsum(x)
  # Prepending n zeros and dropping the last n values shifts the cumulative
  # sum right by n positions.
  shifted <- head(cumsum(c(rep(0, n), x)), -n)
  window_len <- pmin(n, seq_along(x))
  (running - shifted) / window_len
}
my_data$v_1 <- ave(my_data$var_1, my_data$id, FUN=fun2)
my_data$v_2 <- ave(my_data$var_2, my_data$id, FUN=fun2)
Here is a try with a simple function avg to return this type of average
library(dplyr , warn.conflicts = FALSE)
set.seed(1978)
my_data = data.frame(id = c(1,1,1,1,2,2,2,3,4,4,5,5,5,5,5), var_1 = sample(c(0,1), 15, replace = TRUE) , var_2 =sample(c(0,1), 15 , replace = TRUE) )
my_data = data.frame(my_data %>% group_by(id) %>% mutate(index = row_number(id)))
my_data = my_data[,c(1,4,2,3)]
#===================================
# Trailing moving average of `x` over the current and up to two previous
# values; missing values inside a window are ignored.
# Returns a double vector the same length as `x` (numeric(0) for empty input).
# Written with base R only: the earlier mask-shifting version relied on
# dplyr::lag masking stats::lag, used T for TRUE, and failed on empty input.
avg <- function(x) {
  vapply(
    seq_along(x),
    # the window starts at most two positions back, never before position 1
    function(i) mean(x[max(1L, i - 2L):i], na.rm = TRUE),
    numeric(1)
  )
}
#===================================
library(tidyverse)
my_data %>%
group_by(id) %>%
mutate(v_1 = avg(var_1), v_2 = avg(var_2))
#> # A tibble: 15 × 6
#> # Groups: id [5]
#> id index var_1 var_2 v_1 v_2
#> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 0 0 0 0
#> 2 1 2 1 0 0.5 0
#> 3 1 3 1 0 0.667 0
#> 4 1 4 1 1 1 0.333
#> 5 2 1 0 1 0 1
#> 6 2 2 1 1 0.5 1
#> 7 2 3 0 0 0.333 0.667
#> 8 3 1 1 0 1 0
#> 9 4 1 1 1 1 1
#> 10 4 2 0 1 0.5 1
#> 11 5 1 1 1 1 1
#> 12 5 2 1 0 1 0.5
#> 13 5 3 0 1 0.667 0.667
#> 14 5 4 1 0 0.667 0.333
#> 15 5 5 1 0 0.667 0.333
Created on 2022-06-09 by the reprex package (v2.0.1)
This uses dplyr's across with slider's slide_dbl; both from the tidyverse. Slider handles partial windows, so is well-suited to this problem.
(%>% may be used instead of the native pipe |>.)
library(dplyr)
library(slider)
# Sample Data
df <- data.frame(
id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 4L, 4L, 5L, 5L, 5L, 5L, 5L),
index = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 1L, 1L, 2L, 1L, 2L, 3L, 4L, 5L),
var_1 = c(0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L),
var_2 = c(1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L)
)
# Possible answer
df |>
group_by(id) |>
mutate(across(starts_with("var_"), ~ slide_dbl(., mean, .before = 2), .names = "{.col}_mean")) |>
ungroup()
#> # A tibble: 15 × 6
#> id index var_1 var_2 var_1_mean var_2_mean
#> <int> <int> <int> <int> <dbl> <dbl>
#> 1 1 1 0 1 0 1
#> 2 1 2 0 0 0 0.5
#> 3 1 3 1 1 0.333 0.667
#> 4 1 4 0 1 0.333 0.667
#> 5 2 1 1 0 1 0
#> 6 2 2 1 1 1 0.5
#> 7 2 3 0 1 0.667 0.667
#> 8 3 1 1 0 1 0
#> 9 4 1 0 0 0 0
#> 10 4 2 0 0 0 0
#> 11 5 1 0 0 0 0
#> 12 5 2 1 0 0.5 0
#> 13 5 3 0 1 0.333 0.333
#> 14 5 4 0 0 0.333 0.333
#> 15 5 5 0 1 0 0.667
Created on 2022-06-12 by the reprex package (v2.0.1)

Add frequency into dataframe for each group and unique element (R)

I have a table such as
Group Family Nb
1 A 15
2 B 20
3 A 2
3 B 1
3 C 1
4 D 10
4 A 5
5 B 1
5 D 1
And I would like to transform that dataframe such that I have each unique Family element in columns, and for each Group, the frequency of the Nb element, I should then get :
Group A B C D E F
1 1 0 0 0 0 0
2 0 1 0 0 0 0
3 0.5 0.25 0.25 0 0 0
4 0.33 0 0 0.67 0 0
5 0 0.5 0 0.5 0 0
Here is the dput output of the table, in case it helps:
# Group values restored from the table shown in the question; the opening of
# the dput() call was missing, which made the snippet unparseable.
df <- structure(list(Group = c(1L, 2L, 3L, 3L, 3L, 4L, 4L, 5L, 5L),
Family = c("A", "B", "A", "B", "C", "D", "A", "B", "D"),
Nb = c(15L, 20L, 2L, 1L, 1L, 10L, 5L, 1L, 1L)), class = "data.frame", row.names = c(NA,
-9L))
in Base R:
prop.table(xtabs(Nb ~ ., df), 1)
# Family
#Group A B C D
# 1 1.0000000 0.0000000 0.0000000 0.0000000
# 2 0.0000000 1.0000000 0.0000000 0.0000000
# 3 0.5000000 0.2500000 0.2500000 0.0000000
# 4 0.3333333 0.0000000 0.0000000 0.6666667
# 5 0.0000000 0.5000000 0.0000000 0.5000000
If you need it as a data.frame, just wrap the results in as.data.frame.matrix
You can first group_by the Group column, then calculate the frequency and finally pivot the data into a "wide" format.
library(tidyverse)
# Convert Nb to within-Group proportions, then spread Family into columns.
df %>%
  group_by(Group) %>%
  mutate(Nb = Nb / sum(Nb)) %>%
  # id_cols is named explicitly: passing it by position is deprecated in
  # current tidyr releases.
  pivot_wider(id_cols = Group, names_from = "Family", values_from = "Nb", values_fill = 0)
# A tibble: 5 × 5
# Groups: Group [5]
Group A B C D
<int> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 0 0
2 2 0 1 0 0
3 3 0.5 0.25 0.25 0
4 4 0.333 0 0 0.667
5 5 0 0.5 0 0.5
Another possible solution:
library(tidyverse)
df %>%
pivot_wider(names_from = Family, values_from = Nb, values_fill = 0) %>%
mutate(aux = rowSums(.[-1]), across(-Group, ~ .x / aux), aux = NULL)
#> # A tibble: 5 × 5
#> Group A B C D
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 0 0 0
#> 2 2 0 1 0 0
#> 3 3 0.5 0.25 0.25 0
#> 4 4 0.333 0 0 0.667
#> 5 5 0 0.5 0 0.5

Convert value of matrix to 1 or 0 based on the conditions of the second matrix

I have two matrices you can see below:
The first matrix is my actual data collection field. just last column (Group) is the number of group classification of a sample (for example samples 1 & 3 belong to group 1), and other columns are the value of species. I want to convert the value of species in each sample to number one if it has the conditions of the second matrix and if it does not have the conditions of the second matrix convert to zero.
matrix (1)
A B C D Group
1 5 1 6 1 1
2 4 4 5 8 2
3 1 4 3 4 1
4 0 2 7 5 3
5 8 4 3 1 3
matrix (2)
Group1 Group2 Group3
A 1 0 0
B 1 1 0
C 0 0 1
D 1 1 1
The new matrix I want to have is as follows
new matrix
A B C D
1 1 1 0 1
2 0 1 0 1
3 1 1 0 1
4 0 0 1 1
5 0 0 1 1
To better understand the new matrix, let me give an example:
For example, since species A is present only in group 1 (1 is present and 0 is absent in matrix (2)), its value became 1 in samples 1 and 3 and 0 in other samples, or for species D, because it was present in all groups, its value for all samples was one.
Here is another approach:
df2mask <- t(df2)[df1$Group, ]
df2mask
# A B C D
# Group1 1 1 0 1
# Group2 0 1 0 1
# Group1 1 1 0 1
# Group3 0 0 1 1
# Group3 0 0 1 1
dfnew <- ifelse(df1[, -5] * df2mask > 0, 1, 0)
dfnew
# A B C D
# 1 1 1 0 1
# 2 0 1 0 1
# 3 1 1 0 1
# 4 0 0 1 1
# 5 0 0 1 1
Note that the mask and the final matrix are identical because your example does not contain any cell where the species is absent but the mask value is 1.
# Reshape both tables to long form (one row per sample/species and one row
# per species/group), look up the presence flag for each sample's group, and
# spread the flags back out to one column per species.
df1 %>%
  rownames_to_column("ID") %>%
  pivot_longer(-c(ID, Group), names_to = "rn", values_to = "val1") %>%
  left_join(
    df2 %>%
      rownames_to_column("rn") %>%
      # keep only the trailing digit of "Group1", "Group2", ... as an integer
      pivot_longer(-rn, names_pattern = "(\\d)$",
                   names_to = "Group",
                   names_transform = list(Group = as.integer)),
    # join keys stated explicitly instead of relying on the natural join
    by = c("rn", "Group")
  ) %>%
  # id_cols is named explicitly: passing it by position is deprecated in
  # current tidyr releases.
  pivot_wider(id_cols = ID, names_from = "rn", values_from = "value")
# A tibble: 5 x 5
ID A B C D
<chr> <int> <int> <int> <int>
1 1 1 1 0 1
2 2 0 1 0 1
3 3 1 1 0 1
4 4 0 0 1 1
5 5 0 0 1 1
data
df1 <- structure(list(A = c(5L, 4L, 1L, 0L, 8L), B = c(1L, 4L, 4L, 2L,
4L), C = c(6L, 5L, 3L, 7L, 3L), D = c(1L, 8L, 4L, 5L, 1L), Group = c(1L,
2L, 1L, 3L, 3L)), class = "data.frame", row.names = c("1", "2",
"3", "4", "5"))
df2 <- structure(list(Group1 = c(1L, 1L, 0L, 1L), Group2 = c(0L, 1L,
0L, 1L), Group3 = c(0L, 0L, 1L, 1L)), class = "data.frame", row.names = c("A",
"B", "C", "D"))

Mutate with str_detect only if condition is true

Im trying to use str_detect to mutate only if the column "RedColor" is "1".
I have a dataset test which looks like this:
# id RedColor Color_Number
#1 1 1 1
#2 2 0 1
#3 3 1 3
#4 4 1 2
#5 6 0 2
#6 8 1 6
I tried the filter function but it returns me only a filtered dataset with all other cases with RedColor = "0" removed.
test <- test %>%
filter(RedColor==TRUE) %>%
mutate(DarkRed = str_detect(Color_Number, "1|2"))
Im expecting an output with the new column DarkRed = "1" in all cases with RedColor = 1 and 1 or 2 in column Color_Number.
# id RedColor Color_Number Dark_Red
#1 1 1 1 1
#2 2 0 1 0
#3 3 1 3 0
#4 4 1 2 1
#5 6 0 2 0
#6 8 1 6 0
Thank you!
Using base R
transform(df, Dark_Red = +(RedColor == 1& Color_Number %in% 1:2))
id RedColor Color_Number Dark_Red
1 1 1 1 1
2 2 0 1 0
3 3 1 3 0
4 4 1 2 1
5 6 0 2 0
6 8 1 6 0
data
df <- structure(list(id = c(1L, 2L, 3L, 4L, 6L, 8L), RedColor = c(1L,
0L, 1L, 1L, 0L, 1L), Color_Number = c(1L, 1L, 3L, 2L, 2L, 6L)), row.names = c(NA,
-6L), class = "data.frame")
Update on OP's request (see comments):
With this dataframe:
id RedColor Color_Number
1 1 1 one
2 2 0 one
3 3 1 three
4 4 1 two
5 6 0 two
6 8 1 six
you could use this code:
library(dplyr)
df %>%
  # %in% applies the RedColor condition to both colours. The earlier form
  # `a & b | c` parsed as `(a & b) | c`, so every "two" row was flagged even
  # when RedColor was 0.
  mutate(Dark_Red = ifelse(
    RedColor == 1 & Color_Number %in% c("one", "two"), 1, 0))
Output:
id RedColor Color_Number Dark_Red
1 1 1 one 1
2 2 0 one 0
3 3 1 three 0
4 4 1 two 1
5 6 0 two 0
6 8 1 six 0
First answer:
We could use ifelse
str_detect is not appropriate as Ronak already explained:
library(dplyr)
df %>%
  # %in% applies the RedColor condition to both colour numbers. The earlier
  # form `a & b | c` parsed as `(a & b) | c`, so every Color_Number == 2 row
  # was flagged even when RedColor was 0.
  mutate(Dark_Red = ifelse(
    RedColor == 1 & Color_Number %in% 1:2, 1, 0))
Output:
id RedColor Color_Number Dark_Red
1 1 1 1 1
2 2 0 1 0
3 3 1 3 0
4 4 1 2 1
5 6 0 2 0
6 8 1 6 0
For exact matches don't perform regex match. str_detect is used for pattern matching. Use %in% to match multiple values.
library(dplyr)
df <- df %>% mutate(Dark_Red = as.integer(RedColor == 1 & Color_Number %in% 1:2))
df
# id RedColor Color_Number Dark_Red
#1 1 1 1 1
#2 2 0 1 0
#3 3 1 3 0
#4 4 1 2 1
#5 6 0 2 0
#6 8 1 6 0
If you want to write this in base R use transform -
df <- transform(df, Dark_Red = as.integer(RedColor == 1 & Color_Number %in% 1:2))
data
df <- structure(list(id = c(1L, 2L, 3L, 4L, 6L, 8L), RedColor = c(1L,
0L, 1L, 1L, 0L, 1L), Color_Number = c(1L, 1L, 3L, 2L, 2L, 6L)),
row.names = c(NA, -6L), class = "data.frame")
you can use ifelse inside the mutate call instead of filtering:
test <- test %>%
mutate(Darkred=ifelse((RedColor==TRUE & Color_Number %in% 1:2), 1,0))
> test
# A tibble: 10 × 4
id RedColor Color_Number Darkred
<int> <int> <int> <dbl>
1 1 1 2 1
2 2 1 2 1
3 3 1 3 0
4 4 1 3 0
5 5 0 4 0
6 6 0 2 0
7 7 1 3 0
8 8 1 4 0
9 9 0 5 0
10 10 0 3 0
Data:
# Random example data: 10 ids, a 0/1 RedColor flag and a colour number in 1-5.
# tibble() replaces the deprecated data_frame().
test <- tibble(
  id = 1:10,
  RedColor = rbinom(10, 1, 0.5),
  Color_Number = sample(1:5, 10, TRUE, rep(.2, 5))
)

transform a dataframe in R, re organize according to one variable

I have a dataframe called data1 of this type:
individual location type
1 site1 1
2 site2 3
3 site3 2
4 site4 3
5 site5 1
6 site2 4
7 site5 2
I would like to transform the data above into rows (observations) for distinct sites, and count the number of types for each site, and obtain the following:
location type1 type2 type3 type4
site1 1 0 0 0
site2 0 0 1 1
site3 0 1 0 0
site4 0 0 1 0
site5 2 1 0 0
how do I create a dataset with the format above ?
(the above is simplified, my original data are 2500 individuals and 400 locations)
Try this tidyverse approach, which is close to what you want. You can count the individuals for each location/type pair and then reshape to wide format to obtain the expected result. Here is the code:
library(tidyverse)
#Code
df %>%
  #Mutate type
  mutate(type = paste0("type", type)) %>%
  #Count individuals per location/type. summarise() gives one row per pair, so
  #repeated pairs become a count instead of duplicate rows that pivot_wider()
  #cannot spread; the result stays grouped by location.
  group_by(location, type) %>%
  summarise(N = n(), .groups = "drop_last") %>%
  pivot_wider(names_from = type, values_from = N) %>%
  replace(is.na(.), 0) %>%
  #Order columns alphabetically (location, type1, type2, ...); replaces the
  #deprecated current_vars()
  select(all_of(sort(names(.))))
Output:
# A tibble: 5 x 5
# Groups: location [5]
location type1 type2 type3 type4
<chr> <int> <int> <int> <int>
1 site1 1 0 0 0
2 site2 0 0 1 1
3 site3 0 1 0 0
4 site4 0 0 1 0
5 site5 1 1 0 0
Some data used:
#Data
df <- structure(list(individual = 1:7, location = c("site1", "site2",
"site3", "site4", "site5", "site2", "site5"), type = c(1L, 3L,
2L, 3L, 1L, 4L, 2L)), class = "data.frame", row.names = c(NA,
-7L))
In base R, we can just use table
table(df[-1])
# type
#location 1 2 3 4
# site1 1 0 0 0
# site2 0 0 1 1
# site3 0 1 0 0
# site4 0 0 1 0
# site5 1 1 0 0
Or using pivot_wider with values_fn
library(dplyr)
library(tidyr)
df %>%
select(-individual) %>%
pivot_wider(names_from = type, values_from = type, values_fn = length,
values_fill = 0)
# A tibble: 5 x 5
# location `1` `3` `2` `4`
# <chr> <int> <int> <int> <int>
#1 site1 1 0 0 0
#2 site2 0 1 0 1
#3 site3 0 0 1 0
#4 site4 0 1 0 0
#5 site5 1 0 1 0
data
df <- structure(list(individual = 1:7, location = c("site1", "site2",
"site3", "site4", "site5", "site2", "site5"), type = c(1L, 3L,
2L, 3L, 1L, 4L, 2L)), class = "data.frame", row.names = c(NA,
-7L))

Resources