The original dataset contains survey data in long form
Original dataset
T Q1 Q2 Q3
M1 3 5 4
M1 3 1 3
M1 1 3 1
M2 4 4 2
M2 2 2 3
M2 5 5 5
Where T is the type of respondents and Q1--Q3 are the questions, and the cell value corresponds to their agreement level on a 1--5 Likert
scale.
Wanted dataset
T Q A1 A2 A3 A4 A5
M1 Q1 1 0 2 0 0
M2 Q1 0 1 0 1 1
M1 Q2 1 0 1 0 1
M2 Q2 0 1 0 1 1
M1 Q3 1 0 1 1 0
M2 Q3 0 1 1 0 1
Where A1--A5 are the possible answers (1--5 Likert) and the cell value contains the frequency of these answers for each group M1 and M2.
How to get from the Original dataset to the Wanted dataset?
One way would be to use the dplyr and tidyr packages:
library(dplyr)
library(tidyr)

# Example survey: one row per respondent, one column per question (1-5 Likert).
df <- data.frame(
  Type = c("M1", "M1", "M1", "M2", "M2", "M2"),
  Q1 = c(3, 3, 1, 4, 2, 5),
  Q2 = c(5, 1, 3, 4, 2, 5),
  Q3 = c(4, 3, 1, 2, 3, 5)
)

# Count how often each answer occurs per respondent type and question, then
# spread the answers into columns A1..A5 (0 where an answer never occurs).
df %>%
  pivot_longer(-Type, names_to = "Q", values_to = "A") %>%
  # count() returns an ungrouped result, so no grouping leaks downstream.
  count(Type, Q, A, name = "Count") %>%
  mutate(A = paste0("A", A)) %>%
  pivot_wider(
    names_from = A,
    values_from = Count,
    values_fill = 0L,
    # Keep the answer columns in A1..A5 order instead of first-appearance order.
    names_sort = TRUE
  ) %>%
  arrange(Q, Type)
I used tidyverse functions to solve your problem. Notice that I had to create row identifiers because gather and spread are not always symmetric (for more, check this out)
library(tidyverse)
# Data: respondent type (T) plus one column per question (Q1-Q3)
x <- data.frame(
T = c("M1", "M1", "M1", "M2", "M2", "M2"),
Q1 = c(3, 3, 1, 4, 2, 5),
Q2 = c(5, 1, 3, 4, 2, 5),
Q3 = c(4, 3, 1, 2, 3, 5)
)
# Modification: stack the question columns into key/A pairs
gather(x, key, A, -T) %>%
# Number repeated (T, key, A) combinations so every row gets a unique identifier
group_by(T, key, A) %>%
mutate(row_id = 1:n()) %>%
ungroup() %>%
# Spread the answers into columns named A<answer> (sep = "" glues name and value)
# NOTE(review): A is used as both key and value, so the cells appear to hold the
# answer value itself rather than a frequency - confirm against the wanted output
spread(A, A, fill = 0, sep = "") %>%
select(-row_id)
Related
How can I subtract one group of values from all values using group_by in tibble.
Below is an example with expected results. I wish to subtract values of category "A" from all values
# Input: three categories (A, B, C) with three values each.
d <- tibble(
  categories = rep(c("A", "B", "C"), each = 3),
  values = 1:9
)

# Expected outcome, stored under its own name so the input `d` is not
# overwritten before the solutions below are run on it.
expected <- tibble(
  categories = rep(c("A", "B", "C"), each = 3),
  values = c(0, 0, 0, 3, 3, 3, 6, 6, 6)
)
If all categories have the same number of rows, we could do
library(dplyr)

# Subtract the category "A" values from every value; the shorter "A" vector is
# recycled along the column, which is why equal category sizes are required.
d %>%
  mutate(values = values - values[categories == "A"])
-output
# A tibble: 9 × 2
categories values
<chr> <int>
1 A 0
2 A 0
3 A 0
4 B 3
5 B 3
6 B 3
7 C 6
8 C 6
9 C 6
You can do:
library(tidyverse)

# Number the rows within each category, put the categories side by side,
# subtract column A from every category column, then stack them back.
d %>%
  group_by(categories) %>%
  mutate(id = row_number()) %>%
  ungroup() %>%
  pivot_wider(names_from = "categories", values_from = "values") %>%
  mutate(across(-id, ~ .x - A)) %>%
  pivot_longer(
    cols = -id,
    names_to = "categories",
    values_to = "values",
    cols_vary = "slowest"
  ) %>%
  select(-id)
Alternatively:
# Number the rows within each category, then subtract, for every row number,
# the value of the category "A" row that carries the same number.
d %>%
  group_by(categories) %>%
  mutate(id = row_number()) %>%
  # Regroup by row number: the previous `id == id` test was always TRUE, so the
  # rows were never matched by id and the result relied on vector recycling.
  group_by(id) %>%
  mutate(values = values - values[categories == "A"]) %>%
  ungroup() %>%
  select(-id)
# A tibble: 9 x 2
categories values
<chr> <int>
1 A 0
2 A 0
3 A 0
4 B 3
5 B 3
6 B 3
7 C 6
8 C 6
9 C 6
I would like to reshape the data sample below to get the output shown in the table. How can I do that? The idea is to split the column e into two columns according to disease: those with disease 0 in one column and those with disease 1 in the other column. Thanks in advance.
# Sample data: one row per person (`id`), two persons per family (`fid`), a 0/1
# `disease` flag and the measurement `e`. Assigned to `df` because the snippets
# that follow read the data under that name.
df <- structure(
  list(
    id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
    fid = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5),
    disease = c(0, 1, 0, 1, 1, 0, 1, 0, 0, 1),
    e = c(3, 2, 6, 1, 2, 5, 2, 3, 1, 1)
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -10L)
)
library(tidyverse)

# One row per family: spread `e` into columns e0/e1 according to `disease`.
df %>%
  pivot_wider(
    # Name `id_cols` explicitly; passing it by position is deprecated in tidyr.
    id_cols = fid,
    names_from = disease,
    values_from = e,
    names_prefix = "e"
  ) %>%
  select(-fid)
e0 e1
<dbl> <dbl>
1 3 2
2 6 1
3 5 2
4 3 2
5 1 1
if you want the e1,e2 you could do:
# Same reshape, but with the output columns named e1/e2 (disease value + 1).
df %>%
  pivot_wider(
    # Name `id_cols` explicitly; passing it by position is deprecated in tidyr.
    id_cols = fid,
    names_from = disease,
    values_from = e,
    names_glue = "e{disease + 1}"
  ) %>%
  select(-fid)
# A tibble: 5 x 2
e1 e2
<dbl> <dbl>
1 3 2
2 6 1
3 5 2
4 3 2
5 1 1
We could use lead() combined with ifelse() statements for this:
library(dplyr)

# Pair every odd row with the row that follows it, then put the value of the
# disease == 0 row into e1 and the value of the disease == 1 row into e2.
df %>%
  mutate(next_e = lead(e)) %>%
  filter(row_number() %% 2 == 1) %>%
  transmute(
    e1 = ifelse(disease == 1, next_e, e),
    e2 = ifelse(disease == 0, next_e, e)
  )
e1 e2
<dbl> <dbl>
1 3 2
2 6 1
3 5 2
4 3 2
5 1 1
I am trying to get the list of sums of two columns from my original data set, from left to right
I have made a loop:
# Attempt to sum each adjacent pair of columns (1+2, 3+4, ...), left to right.
for (i in 1:ncol(df)) {
m = i
n = i + 1
# Only an odd i starts a new column pair (m, n).
if (i %% 2 != 0) {
# NOTE(review): every pair is written to the same element `V1`, so each
# iteration overwrites the previous sum; `df_cum` is not defined in the code
# shown here.
df_cum$V1 <- sum(df[,m] + df[,n])
}
}
But, the way to add value to the new list is wrong:
df_cum$V1 <- sum(df[,m] + df[,n])
would be really appreciated if anyone knows how to do that in R
You can try split.default(), i.e.
# Split the columns by name prefix (digits stripped) and sum each group.
# vapply() pins the result to one number per group, whatever the input.
vapply(split.default(df, gsub("\\d+", "", names(df))), sum, numeric(1))
A B
17 12
A base R option using tapply -
# Label every cell with the index of its two-column block, then sum per label.
local({
  n_rows <- nrow(df)
  pair_id <- rep(
    seq_len(ncol(df)),
    each = 2 * n_rows,
    length.out = n_rows * ncol(df)
  )
  tapply(unlist(df), pair_id, sum)
})
# 1 2 3
#17 12 13
The logic here is to create group of every 2 columns and sum them.
data
It is easier to help if you provide data in a reproducible format
# Example data: three column pairs (A, B, C), three rows each.
df <- data.frame(
  A1 = c(0, 3, 2),
  A2 = c(2, 6, 4),
  B1 = c(3, 0, 1),
  B2 = c(2, 3, 3),
  C1 = c(7, 3, 2),
  C2 = c(1, 0, 0)
)
We can do this in tidyverse
library(dplyr)
library(tidyr)

# Split each column name into letter prefix and digit suffix, stack the columns
# that share a prefix, and total every prefix.
df1 %>%
  pivot_longer(
    everything(),
    names_to = c(".value", "grp"),
    names_sep = "(?<=[A-Z])(?=[0-9])"
  ) %>%
  select(-grp) %>%
  # Pass na.rm through a lambda: extra arguments via across()'s `...` are
  # deprecated. The data is ungrouped, so no `.groups` argument is needed.
  summarise(across(everything(), ~ sum(.x, na.rm = TRUE)))
-output
# A tibble: 1 x 3
A B C
<dbl> <dbl> <dbl>
1 17 12 13
Or using base R
# Stack all columns, strip the digits from the column label, and sum per label.
local({
  long <- stack(df1)
  long$ind <- sub("\\d+", "", long$ind)
  aggregate(values ~ ind, data = long, FUN = sum)
})
ind values
1 A 17
2 B 12
3 C 13
Or another option with rowsum from base R
# Stack all columns and sum the values by column label with the digits trimmed.
local({
  long <- stack(df1)
  rowsum(long$values, group = trimws(long$ind, whitespace = "\\d+"))
})
[,1]
A 17
B 12
C 13
Or another option is with colSums and rowsum
# Total every column first, then add up the totals that share a first letter.
col_totals <- colSums(df1)
rowsum(col_totals, group = substr(names(col_totals), 1, 1))
[,1]
A 17
B 12
C 13
data
# Example data: three column pairs (A, B, C), three rows each.
df1 <- data.frame(
  A1 = c(0, 3, 2),
  A2 = c(2, 6, 4),
  B1 = c(3, 0, 1),
  B2 = c(2, 3, 3),
  C1 = c(7, 3, 2),
  C2 = c(1, 0, 0)
)
How might I calculate the delta between multiple variables grouped by user ids in a "long" data frame?
Data format:
# Long-format example: two rows (product A and B) per user id, with a flag for
# the product that was purchased and three numeric attributes per product.
d1 <- data.frame(
  id = rep(c(1, 2, 3, 4, 5), each = 2),
  # Spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned.
  purchased = c(rep(c(TRUE, FALSE), 3), FALSE, TRUE, TRUE, FALSE),
  product = rep(c("A", "B"), 5),
  grade = c(1, 2, 1, 2, 2, 3, 7, 5, 1, 2),
  rate = c(10, 12, 10, 12, 12, 14, 22, 18, 10, 12),
  fee = rep(c(1, 2), 5)
)
This is my roundabout solution:
# Split the rows by product, join the two halves on id, and compute B minus A
# for every numeric attribute.
dA <- d1 %>%
  filter(product == "A")
dB <- d1 %>%
  filter(product == "B")
d2 <- inner_join(dA, dB, by = "id", suffix = c(".A", ".B"))
d3 <- d2 %>%
  mutate(
    # purchased.A is already logical, so it is used directly as the condition.
    purchased = if_else(purchased.A, "A", "B"),
    dGrade = grade.B - grade.A,
    dRate = rate.B - rate.A,
    dFee = fee.B - fee.A
  ) %>%
  select(id, purchased:dFee)
All of this just seems terribly inefficient and complex. Is tidyr::spread or another dplyr/tidyr function appropriate here? (I couldn't get anything else to work)...
We can do this with gather/spread. Reshape the data from 'wide' to 'long' using gather, grouped by 'id', 'Var', we get the 'product' based on the logical column 'purchased', get the difference of 'Val' for 'product' that are 'B' and 'A', and spread it from 'long' to 'wide' format.
library(dplyr)
library(tidyr)

# Stack grade, rate and fee into Var/Val pairs; per id and Var keep the
# purchased product and the B-minus-A difference, then widen by Var again.
d1 %>%
  gather(Var, Val, grade:fee) %>%
  group_by(id, Var) %>%
  summarise(
    purchased = product[purchased],
    Val = Val[product == "B"] - Val[product == "A"]
  ) %>%
  spread(Var, Val)
# id purchased fee grade rate
# <dbl> <fctr> <dbl> <dbl> <dbl>
#1 1 A 1 1 2
#2 2 A 1 1 2
#3 3 A 1 1 2
#4 4 B 1 -2 -4
#5 5 A 1 1 2
The OP's output ('d3') is
d3
# id purchased dGrade dRate dFee
#1 1 A 1 2 1
#2 2 A 1 2 1
#3 3 A 1 2 1
#4 4 B -2 -4 1
#5 5 A 1 2 1
I am always unsure how to retrieve a summary with dplyr.
Let us suppose I have a summary of individuals and households.
# Individuals (id) nested in households (householdid), with their age.
# The last person belongs to household 4, matching the table printed below.
dta <- data.frame(
  householdid = c(1, 1, 2, 2, 3, 4, 4, 4),
  id = c(1, 2, 1, 2, 1, 1, 2, 3),
  age = c(45, 47, 24, 26, 67, 20, 21, 7)
)
householdid id age
1 1 45
1 2 47
2 1 24
2 2 26
3 1 67
4 1 20
4 2 21
4 3 7
Imagine I want to calculate the number of persons in each household and the mean age by household, and then re-use this information in the original dataset.
# Summarise each household, then merge the summary back onto the person rows.
dta %>%
  group_by(householdid) %>%
  summarise(nhouse = n(), meanAgeHouse = mean(age)) %>%
  # all = TRUE keeps unmatched rows from both sides; TRUE is spelled out because
  # T is an ordinary variable that can be reassigned.
  merge(dta, all = TRUE)
I am often using merge, but it is slow sometimes when the dataset is huge.
Is it possible to
mutate
instead of
merge ?
# Add the household size and mean age to every person row without a merge.
dta %>%
  group_by(householdid) %>%
  mutate(nhouse = n(), meanAgeHouse = mean(age)) %>%
  # Drop the grouping so later verbs do not silently operate per household.
  ungroup()