I work with R, and I have a table xy like this:
View(xy)
X Y
21 A
33 B
24 B
16 A
25 B
31 A
17 B
14 A
Now I want to group X in steps of 10 and count the frequencies of Y within each group, so that I end up with this:
Class A B
I (1-10) 0 0
II (11-20) 2 1
III (21-30) 1 2
And so on
First create the labels, using either the commented-out hard-coded labels or the computed labels lab. Then use cut and table to create the resulting table.
# lab <- c("I (1-10)", "II (11-20)", "III (21-30)", "IV (31-40)")
n <- ceiling(max(DF$X) / 10) # 4
bounds <- seq(0, 10*n, 10) # c(0, 10, 20, 30, 40)
lab <- sprintf("%s (%d-%d)", as.roman(1:n), head(bounds, -1) + 1, bounds[-1])
Class <- cut(DF$X, bounds, lab = lab)
table(Class, Y = DF$Y)
giving:
Y
Class A B
I (1-10) 0 0
II (11-20) 2 1
III (21-30) 1 2
IV (31-40) 1 1
Note
We assume the input data frame DF is the following, shown in reproducible form:
Lines <- "
X Y
21 A
33 B
24 B
16 A
25 B
31 A
17 B
14 A"
DF <- read.table(text = Lines, header = TRUE)
One tidyverse possibility could be:
df %>%
  mutate(Class = X %/% 10) %>%
  count(Y, Class) %>%
  group_by(Y) %>%
  complete(Class = seq(0, max(Class), 1)) %>%
  spread(Y, n, fill = 0)
Class A B
<dbl> <dbl> <dbl>
1 0 0 0
2 1 2 1
3 2 1 2
4 3 1 1
Or if you also want the ranges:
df %>%
  mutate(Class = X %/% 10) %>%
  count(Y, Class) %>%
  group_by(Y) %>%
  complete(Class = seq(0, max(Class), 1)) %>%
  spread(Y, n, fill = 0) %>%
  mutate(Class = paste(Class * 10 + 1,
                       lead(Class * 10, default = ((last(Class) + 1) * 10)),
                       sep = "-"))
Class A B
<chr> <dbl> <dbl>
1 1-10 0 0
2 11-20 2 1
3 21-30 1 2
4 31-40 1 1
Or if you want the exact output you provided:
df %>%
  mutate(Class = X %/% 10) %>%
  count(Y, Class) %>%
  group_by(Y) %>%
  complete(Class = seq(0, max(Class), 1)) %>%
  spread(Y, n, fill = 0) %>%
  mutate(Class = paste0("(",
                        Class * 10 + 1,
                        "-",
                        lead(Class * 10, default = ((last(Class) + 1) * 10)),
                        ")"),
         Class = paste(as.roman(row_number()), Class, sep = " "))
Class A B
<chr> <dbl> <dbl>
1 I (1-10) 0 0
2 II (11-20) 2 1
3 III (21-30) 1 2
4 IV (31-40) 1 1
Or a possibility for cases where X == 0 occurs (such values are excluded first, since they would otherwise land in the 1-10 class):
df %>%
  filter(X > 0) %>%
  mutate(Class = X %/% 10) %>%
  count(Y, Class) %>%
  group_by(Y) %>%
  complete(Class = seq(0, max(Class), 1)) %>%
  spread(Y, n, fill = 0) %>%
  mutate(Class = paste0("(",
                        Class * 10 + 1,
                        "-",
                        lead(Class * 10, default = ((last(Class) + 1) * 10)),
                        ")"),
         Class = paste(as.roman(row_number()), Class, sep = " "))
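The spread/complete idioms above have since been superseded by pivot_wider; a minimal equivalent of the first variant, as a sketch assuming tidyr >= 1.0:
library(dplyr)
library(tidyr)

df %>%
  mutate(Class = X %/% 10) %>%
  count(Y, Class) %>%
  complete(Y, Class = 0:max(Class), fill = list(n = 0)) %>%  # fill in empty classes
  pivot_wider(names_from = Y, values_from = n)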
Related
Consider two tibbles data and key, given here:
library(tidyverse) # v1.3.2
set.seed(123)
data <- tibble(id = rep(LETTERS[1:10], each = 10),
               position = rep(1:10, 10),
               zip = sample(letters, 100, replace = T),
               zap = sample(letters, 100, replace = T),
               zop = sample(letters, 100, replace = T))
# A tibble: 100 × 5
id position zip zap zop
<chr> <int> <chr> <chr> <chr>
1 A 1 l n u
2 A 2 y f h
3 A 3 n y u
4 A 4 c h g
5 A 5 n l t
6 A 6 g z r
7 A 7 c d q
8 A 8 w m a
9 A 9 v n b
10 A 10 z u q
# … with 90 more rows
key <- tibble(id = c("A", "D", "H"),
              start = c(2, 5, 7),
              end = c(4, 6, 9))
# A tibble: 3 × 3
id start end
<chr> <dbl> <dbl>
1 A 2 4
2 D 5 6
3 H 7 9
And the desired output:
# A tibble: 8 × 5
id position zip zap zop
<chr> <int> <chr> <chr> <chr>
1 A 2 s u w
2 A 3 n e a
3 A 4 c h h
4 D 5 j j w
5 D 6 m e z
6 H 7 m v h
7 H 8 e q w
8 H 9 v j y
What's the most efficient way to subset data by id and the range of position given by key? I can think of two approaches, but neither is very fast.
1. apply() across rows of key, and bind the pieces
apply(X = key, MARGIN = 1, function(x) {
  data |>
    dplyr::filter(id == x[1],
                  position %in% x[2]:x[3])
}) |> dplyr::bind_rows()
2. pivot and fill key, then join()
key |>
  tidyr::pivot_longer(cols = c(start, end),
                      values_to = "position") |>
  dplyr::select(id, position) |>
  dplyr::group_by(id) |>
  tidyr::complete(position = seq(from = min(position),
                                 to = max(position))) |>
  dplyr::left_join(data)
What tidy approach would likely be fastest given data with millions of lines and a key with hundreds?
We may do an inner_join and then slice after grouping (this works here because position runs 1..n within each id, so row indices coincide with positions):
library(dplyr)
inner_join(data, key) %>%
  group_by(id) %>%
  slice(first(start):first(end)) %>%
  ungroup %>%
  select(-c(start, end))
-output
# A tibble: 8 × 5
id position zip zap zop
<chr> <int> <chr> <chr> <chr>
1 A 2 s u w
2 A 3 n e a
3 A 4 c h h
4 D 5 j j w
5 D 6 m e z
6 H 7 m v h
7 H 8 e q w
8 H 9 v j y
Or another option is to make use of cur_group() after grouping by 'id' to subset with the matching 'key' row (again relying on row numbers coinciding with positions):
data %>%
  filter(id %in% key$id) %>%
  group_by(id) %>%
  filter(row_number() >= key$start[match(cur_group()$id, key$id)],
         row_number() <= key$end[match(cur_group()$id, key$id)]) %>%
  ungroup
-output
# A tibble: 8 × 5
id position zip zap zop
<chr> <int> <chr> <chr> <chr>
1 A 2 s u w
2 A 3 n e a
3 A 4 c h h
4 D 5 j j w
5 D 6 m e z
6 H 7 m v h
7 H 8 e q w
8 H 9 v j y
I did some benchmarking of my methods and the methods provided by akrun. Overall, it seems like the function that uses inner_join is most efficient.
Load libraries and create mock data d_ and key k_
library(tidyverse)
library(microbenchmark)
set.seed(123)
d_ <- tibble(id = rep(LETTERS[1:20], each = 1000),
             position = rep(1:1000, 20))
k_ <- tibble(id = LETTERS[1:20],
             start = as.double(sample(500, 20)),
             end = start + 300)
Write different methods as functions
method1 <- function(data, key) {
  apply(X = key, MARGIN = 1, function(x) {
    data |> dplyr::filter(id == x[1],
                          position %in% x[2]:x[3])
  }) |> dplyr::bind_rows()
}

method2 <- function(data, key) {
  key |>
    tidyr::pivot_longer(cols = c(start, end),
                        values_to = "position") |>
    dplyr::select(id, position) |>
    dplyr::group_by(id) |>
    tidyr::complete(position = seq(from = min(position),
                                   to = max(position))) |>
    dplyr::left_join(data)
}

method3 <- function(data, key) {
  dplyr::inner_join(data, key) |>
    dplyr::group_by(id) |>
    dplyr::slice(dplyr::first(start):dplyr::first(end)) |>
    dplyr::ungroup() |>
    dplyr::select(-c(start, end))
}

method4 <- function(data, key) {
  data |>
    dplyr::filter(id %in% key$id) |>
    dplyr::group_by(id) |>
    dplyr::filter(dplyr::row_number() >= key$start[match(dplyr::cur_group()$id, key$id)],
                  dplyr::row_number() <= key$end[match(dplyr::cur_group()$id, key$id)]) |>
    dplyr::ungroup()
}
Evaluate each function 100 times with microbenchmark
mbm <- microbenchmark("acvill 1" = { method1(d_, k_) },
                      "acvill 2" = { method2(d_, k_) },
                      "akrun 1" = { method3(d_, k_) },
                      "akrun 2" = { method4(d_, k_) },
                      times = 100)
Plot benchmarking results
ggplot(data = tibble(method = mbm$expr, time = mbm$time)) +
  geom_violin(mapping = aes(x = method, y = time / 10^6, fill = method)) +
  ylab("milliseconds") +
  theme_classic() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(limits = c(0, 400),
                     breaks = seq(0, 400, 50)) +
  theme(axis.title.y = element_blank(),
        axis.text = element_text(color = "black", size = 10),
        legend.position = "none") +
  coord_flip()
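Since this is a range join, a non-equi join is also worth benchmarking: dplyr 1.1.0 introduced join_by(), which expresses the position-within-range condition directly and avoids the per-group slicing. A minimal sketch, assuming dplyr >= 1.1.0 (not part of the benchmark above):
library(dplyr) # >= 1.1.0 for join_by()

data |>
  inner_join(key, by = join_by(id, between(position, start, end))) |>
  select(-start, -end)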
I have a data frame df1. I would like to find the minimum turning point in each column, i.e. the smallest value whose neighbours on both sides are larger than it. For example, in x = c(2,5,3,6,1,1,1) the minimum turning point is 3, but with the min function I can only find the overall minimum, which is 1. If there is no turning point, I would like to get NA. Thanks.
df1 in reproducible form:
structure(list(x = c(2, 5, 3, 6, 1, 1, 1),
               y = c(6, 9, 3, 6, 3, 1, 1),
               z = c(9, 3, 5, 1, 4, 6, 2)),
          row.names = c(NA, -7L),
          class = c("tbl_df", "tbl", "data.frame"))
df1:
x y z
2 6 9
5 9 3
3 3 5
6 6 1
1 3 4
1 1 6
1 1 2
Desired result, as shown below:
df2:
x y z
3 3 1
You can use lead and lag to compare the current value with the previous and next values.
library(dplyr)
df %>% summarise(across(.fns = ~min(.x[which(lag(.x) > .x & lead(.x) > .x)])))
# x y z
# <dbl> <dbl> <dbl>
#1 3 3 1
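Note that the question asks for NA when a column has no turning point, while min() on an empty vector returns Inf with a warning. A small wrapper, assuming dplyr is loaded, covers that case:
# Returns NA instead of Inf when a column has no interior valley
min_valley <- function(x) {
  v <- x[which(dplyr::lag(x) > x & dplyr::lead(x) > x)]
  if (length(v) == 0) NA_real_ else min(v)
}

df %>% summarise(across(everything(), min_valley))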
You can use diff, take the sign, then diff again to find the valleys. Use min to get the lowest valley.
#Value
sapply(df, function(x) min(x[1+which(diff(sign(diff(x))) == 2)]))
#x y z
#3 3 1
#Position
sapply(df, function(x) {
  tt <- 1 + which(diff(sign(diff(x))) == 2)
  tt[which.min(x[tt])]
})
#x y z
#3 3 4
But this will work only in case the valley is one position wide.
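For example (an added illustration), a valley two positions wide never produces a jump of 2 in the differenced signs, so it goes undetected:
x <- c(5, 2, 2, 5)
diff(sign(diff(x)))                  # 1 1 -- no 2 anywhere
1 + which(diff(sign(diff(x))) == 2)  # numeric(0): the valley at 2 is missed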
A more robust solution is to use the function from Finding local maxima and minima:
peakPosition <- function(x, inclBorders = TRUE) {
  if (inclBorders) {
    y <- c(min(x), x, min(x))
  } else {
    y <- c(x[1], x)
  }
  y <- data.frame(x = sign(diff(y)), i = 1:(length(y) - 1))
  y <- y[y$x != 0, ]
  idx <- diff(y$x) < 0
  (y$i[c(idx, F)] + y$i[c(F, idx)] - 1) / 2
}
#Value
sapply(df, function(x) min(x[ceiling(peakPosition(-x, FALSE))]))
#x y z
#3 3 1
#Position
sapply(df, function(x) {
  tt <- peakPosition(-x, FALSE)
  tt[which.min(x[floor(tt)])]
})
#x y z
#3 3 4
An alternative would be to use rle:
x <- c(8,9,3,3,8,1,1)
y <- rle(x)
i <- 1 + which(diff(sign(diff(y$values))) == 2)
min(y$values[i]) #Value
#[1] 3
j <- which.min(y$values[i])
1+sum(y$lengths[seq(i[j])-1]) #First Position
#[1] 3
sum(y$lengths[seq(i[j])]) #Last Position
#[1] 4
Alternate approach
df %>%
  summarise_all(~ ifelse(min(.) == last(.) | min(.) == first(.),
                         min(.[. != last(.) & . != first(.)]),
                         min(.)))
x y z
1 3 3 1
For returning the row numbers:
df %>%
  mutate_all(~ ifelse(min(.) == last(.) | min(.) == first(.),
                      min(.[. != last(.) & . != first(.)]),
                      min(.))) %>%
  mutate(id = row_number()) %>%
  left_join(df %>% mutate(id = row_number()), by = "id") %>%
  mutate(x_r = ifelse(x.x == x.y, row_number(), 0),
         y_r = ifelse(y.x == y.y, row_number(), 0),
         z_r = ifelse(z.x == z.y, row_number(), 0)) %>%
  select(ends_with("r")) %>%
  summarise_all(~ min(.[. != 0]))
x_r y_r z_r
1 3 3 4
I'm trying to combine columns in my data frame so that they give me a certain string. I have columns titled "C", "H", "O", "N", and "S" for the elements. Those columns list the number of atoms of each element in the molecule, but I want to exclude some elements depending on their value. For example, when there are no oxygens the value is 0, so I want to exclude that element when combining the others into a string.
#This is a portion of my data frame titled data4a
C H O N S
3 4 0 0 1
7 5 4 1 0
#The code I have is
data4a$NewComp = paste("C", data4a$Total.C, "H", data4a$NewH, "O", data4a$O,
                       "N", data4a$N, "S", data4a$S, sep = "")
#This code gives me this
C H O N S NewComp
3 4 0 0 1 C3H4O0N0S1
7 5 4 1 0 C7H5O4N1S0
#I expect to see something like this when I print my results
C H O N S NewComp
3 4 0 0 1 C3H4S1
7 5 4 1 0 C7H5O4N1
#I want values of zero to be excluded from the string created
An option is apply with argument MARGIN = 1
dat$NewComp <- apply(dat, 1, function(x) {
  tmp <- unlist(x)
  paste0(names(x)[tmp != 0], tmp[tmp != 0], collapse = "")
})
Result
dat
# C H O N S NewComp
#1 3 4 0 0 1 C3H4S1
#2 7 5 4 1 0 C7H5O4N1
data
dat <- structure(list(C = c(3L, 7L), H = 4:5, O = c(0L, 4L), N = 0:1,
                      S = c(1L, 0L)),
                 .Names = c("C", "H", "O", "N", "S"),
                 class = "data.frame", row.names = c(NA, -2L))
Here is a base R solution that solves the problem and simplifies the creation of the molecule strings at the same time.
m <- matrix(paste0(names(data4a), t(as.matrix(data4a))),
            ncol = ncol(data4a), byrow = TRUE)
m <- apply(m, 1, paste, collapse = "")
# "[A-Z]0" removes element/zero pairs; the original pattern ".0" would also
# strip digit pairs such as the "10" in "C10"
data4a$NewComp <- gsub("[A-Z]0", "", m)
data4a
# C H O N S NewComp
#1 3 4 0 0 1 C3H4S1
#2 7 5 4 1 0 C7H5O4N1
Data.
data4a <- read.table(text = "
C H O N S
3 4 0 0 1
7 5 4 1 0
", header = TRUE)
Another approach could be to use which to create a new data frame with the row number, column number, and value of each non-zero entry. We then replace the column numbers with column names and use aggregate by row number to paste the formulas together.
df1 <- which(df != 0, arr.ind = TRUE)
df2 <- cbind.data.frame(df1, value = df[df != 0])
df2$col <- names(df)[df2$col]
df$NewComp <- aggregate(paste0(df2$col, df2$value), list(df2$row),
                        paste0, collapse = "")[, 2]
df
# C H O N S NewComp
#1 3 4 0 0 1 C3H4S1
#2 7 5 4 1 0 C7H5O4N1
As mentioned in the comments on the other answer, if only some columns hold element counts, use df[selected_columns] in the first which statement.
One possibility involving tidyverse could be:
df %>%
  rowid_to_column() %>%
  gather(var, val, -rowid) %>%
  filter(val != 0) %>%
  group_by(rowid) %>%
  summarise(NewComp = paste0(paste0(var, val), collapse = "")) %>%
  left_join(df %>% rowid_to_column(), by = c("rowid" = "rowid")) %>%
  ungroup() %>%
  select(-rowid)
NewComp C H O N S
<chr> <int> <int> <int> <int> <int>
1 C3H4S1 3 4 0 0 1
2 C7H5O4N1 7 5 4 1 0
Or:
df %>%
  rowid_to_column() %>%
  gather(var, val, -rowid) %>%
  filter(val != 0) %>%
  group_by(rowid) %>%
  mutate(NewComp = paste0(paste0(var, val), collapse = "")) %>%
  spread(var, val, fill = 0) %>%
  ungroup() %>%
  select(-rowid)
Sample data:
df <- read.table(text = "C H O N S
3 4 0 0 1
7 5 4 1 0",
header = TRUE,
stringsAsFactors = FALSE)
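As an added note, gather/spread have since been superseded; the first pipeline above with current tidyr, as a sketch assuming tidyr >= 1.0 and the tidyverse loaded:
df %>%
  rowid_to_column() %>%
  pivot_longer(-rowid, names_to = "var", values_to = "val") %>%
  filter(val != 0) %>%
  group_by(rowid) %>%
  summarise(NewComp = paste0(paste0(var, val), collapse = "")) %>%
  left_join(df %>% rowid_to_column(), by = "rowid") %>%
  select(-rowid)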
I want to manually create a tibble where one column's values are calculated from the previous value of the same column.
For example:
tibble(
  x = 1:5,
  y = x + lag(y, default = 0)
)
I expect the following result:
# A tibble: 5 x 2
x y
<int> <dbl>
1 1 1
2 2 3
3 3 6
4 4 10
5 5 15
But I obtain the error:
Error in lag(y, default = 0) : object 'y' not found
Update, with a more realistic example:
tibble(
  years = 1:5,
  salary = 20000 * (1.01) ^ lag(years, default = 0),
  qta = salary * 0.06
) %>%
  mutate(
    total = ifelse(row_number() == 1,
                   (qta + 50000) * (1.02),
                   (qta + lag(total, default = 0)) * (1.02))
  )
In this example I have a tibble, and I want to add a column 'total' that is defined in terms of its own previous value, but lag(total, default = 0) doesn't work.
We can use accumulate
library(tidyverse)
tibble(x = 1:5, y = accumulate(x, `+`))
# A tibble: 5 x 2
# x y
# <int> <int>
#1 1 1
#2 2 3
#3 3 6
#4 4 10
#5 5 15
For a general function, it would be
tibble(x = 1:5, y = accumulate(x, ~ .x + .y))
We can also specify the initialization value
tibble(x = 1:5, y = accumulate(x[-1], ~ .x + .y, .init = x[1]))
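The same accumulate idea carries over to the updated salary example, where total depends on its own previous value. A sketch, with the recurrence taken from the question (tidyverse loaded as above):
tibble(
  years = 1:5,
  salary = 20000 * 1.01 ^ lag(years, default = 0),
  qta = salary * 0.06
) %>%
  # .x is the running total, .y the current qta;
  # .init supplies the first-row case (qta + 50000) * 1.02
  mutate(total = accumulate(qta[-1], ~ (.y + .x) * 1.02,
                            .init = (qta[1] + 50000) * 1.02))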
You need x instead of y in the lag() call for the code to run without an error:
tibble(
  x = 1:5,
  y = x + lag(x, default = 0)
)
But as per #Ronak Shah's comment, you need the cumsum() function to get the same result as your example:
tibble(
  x = 1:5,
  y = cumsum(x)
)
df1 <- data.frame(c("male", "female", "male"),
                  c("1", "2", "3", "4", "5", "6"),
                  seq(141, 170))
names(df1) = c("gender", "age", "height")
df1$age <- factor(
df1$age,
levels = c(1, 2, 3, 4, 5, 6),
labels = c("16-24", "25-34", "35-44", "45-54", "55-64", "65+")
)
q1a = c(1, 0, 1, 0, 0, 1)
q1b = c(0, 0, 2, 2, 2, 0)
q1c = c(0, 0, 3, 3, 0, 3)
# 1,2 and 3 used to be compatible with existing datasets.
# Could change all to 1 if necessary.
df2 <- data.frame(q1a = q1a, q1b = q1b, q1c = q1c)
df1 <- cbind(df1, df2)
rm(q1a, q1b, q1c, df2)
I am looking to replicate the analysis of multiple response questions from SPSS in R.
At the moment I am using this code:
#creating function for analysing questions with grouped data
multfreqtable <- function(a, b, c) {
  # number of respondents (for percent of cases)
  totrep = sum(a == 1 | b == 2 | c == 3)
  # creating frequency table
  table_a = data.frame("a", sum(a == 1))
  names(table_a) = c("question", "freq")
  table_b = data.frame("b", sum(b == 2))
  names(table_b) = c("question", "freq")
  table_c = data.frame("c", sum(c == 3))
  names(table_c) = c("question", "freq")
  table_question <- rbind(table_a, table_b, table_c)
  # remove individual question tables
  rm(table_a, table_b, table_c)
  # adding total
  total = as.data.frame("Total")
  totalsum = sum(table_question$freq, na.rm = TRUE)
  totalrow = cbind(total, totalsum)
  names(totalrow) = c("question", "freq")
  table_question = rbind(table_question, totalrow)
  # adding percentage column to frequency table
  percentcalc = as.numeric(table_question$freq)
  percent = (percentcalc / totalsum) * 100
  table_question <- cbind(table_question, percent)
  # adding percent of cases column to frequency table
  poccalc = as.numeric(table_question$freq)
  percentofcases = (poccalc / totrep) * 100
  table_question <- cbind(table_question, percentofcases)
  # store the respondent count in the global environment
  total_respondents <<- data.frame(totrep)
  # remove all unnecessary data and values
  rm(total, totalsum, totalrow, b, c, percent, percentcalc,
     percentofcases, totrep, poccalc)
  return(table_question)
}
#calling function - must tie to data.frame using $ !!!
q1_frequency <- multfreqtable(df1$q1a, df1$q1b, df1$q1c)
#renaming percent of cases - This is very important while using current method
total_respondents_q1 <- total_respondents
rm(total_respondents)
Producing a frequency table with percent and percent-of-cases columns as a result:
I am looking for a more efficient method of doing this, ideally one that would not require the function to be edited if there were more or fewer multiple-choice questions.
Your function is actually far too complicated for what you need to do. I think a function like this should work and be more flexible.
multfreqtable = function(data, question.prefix) {
  # Find the columns with the questions
  a = grep(question.prefix, names(data))
  # Find the total number of responses
  b = sum(data[, a] != 0)
  # Find the totals for each question
  d = colSums(data[, a] != 0)
  # Find the number of respondents
  e = sum(rowSums(data[, a]) != 0)
  # d + b as a vector. This is your overall frequency
  f = as.numeric(c(d, b))
  data.frame(question = c(names(d), "Total"),
             freq = f,
             percent = (f / b) * 100,
             percentofcases = (f / e) * 100)
}
Add another question to your example dataset:
set.seed(1); df1$q2a = sample(c(0, 1), 30, replace=T)
set.seed(2); df1$q2b = sample(c(0, 2), 30, replace=T)
set.seed(3); df1$q2c = sample(c(0, 3), 30, replace=T)
Make a table for "q1" responses:
> multfreqtable(df1, "q1")
question freq percent percentofcases
1 q1a 15 33.33333 60
2 q1b 15 33.33333 60
3 q1c 15 33.33333 60
4 Total 45 100.00000 180
Make a table for "q2" responses:
> multfreqtable(df1, "q2")
question freq percent percentofcases
1 q2a 14 31.11111 53.84615
2 q2b 13 28.88889 50.00000
3 q2c 18 40.00000 69.23077
4 Total 45 100.00000 173.07692
Tables for multiple questions
Here's a modified version of the function that allows you to create a list of tables for multiple questions at once:
multfreqtable = function(data, question.prefix) {
  z = length(question.prefix)
  temp = vector("list", z)
  for (i in 1:z) {
    a = grep(question.prefix[i], names(data))
    b = sum(data[, a] != 0)
    d = colSums(data[, a] != 0)
    e = sum(rowSums(data[, a]) != 0)
    f = as.numeric(c(d, b))
    temp[[i]] = data.frame(question = c(sub(question.prefix[i], "", names(d)), "Total"),
                           freq = f,
                           percent = (f / b) * 100,
                           percentofcases = (f / e) * 100)
    names(temp)[i] = question.prefix[i]
  }
  temp
}
Examples:
> multfreqtable(df1, "q1")
$q1
question freq percent percentofcases
1 a 15 33.33333 60
2 b 15 33.33333 60
3 c 15 33.33333 60
4 Total 45 100.00000 180
> test1 = multfreqtable(df1, c("q1", "q2"))
> test1
$q1
question freq percent percentofcases
1 a 15 33.33333 60
2 b 15 33.33333 60
3 c 15 33.33333 60
4 Total 45 100.00000 180
$q2
question freq percent percentofcases
1 a 14 31.11111 53.84615
2 b 13 28.88889 50.00000
3 c 18 40.00000 69.23077
4 Total 45 100.00000 173.07692
> test1$q1
question freq percent percentofcases
1 a 15 33.33333 60
2 b 15 33.33333 60
3 c 15 33.33333 60
4 Total 45 100.00000 180
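As an aside (an added note), the same list of tables can be produced without the loop, by mapping the earlier single-question version of multfreqtable over the prefixes:
# assumes the single-question multfreqtable() defined above
prefixes <- c("q1", "q2")
tables <- setNames(lapply(prefixes, function(p) multfreqtable(df1, p)), prefixes)
tables$q1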
I've noticed that this post is quite old; however, I couldn't find a more up-to-date solution. Here's my version based on a dplyr/tidyverse approach.
mult_resp = function(df1, mv_q = c("q1a", "q1b", "q1c")) {
  df2 = df1 %>%
    mutate(id = rownames(.)) %>%   # row id for counting n_cases
    select(id, everything()) %>%
    mutate_at(mv_q, ~ ifelse(. != 0, 1, 0)) %>%
    gather(question, resp, -id, -gender, -age, -height)
  # count number of cases, excluding "all zeros" cases
  n_cases = df2 %>%
    group_by(id) %>%
    summarise(n = sum(resp)) %>%
    summarise(sum(n > 0))
  # output table
  res = df2 %>%
    group_by(question) %>%
    summarise(freq = sum(resp)) %>%
    mutate(
      percent = freq / sum(freq) * 100,
      percent_of_cases = freq / as.numeric(n_cases) * 100
    ) %>%
    rbind(.,
          data.frame(question = "Total",
                     freq = sum(.$freq, na.rm = TRUE),
                     percent = sum(.$percent, na.rm = TRUE),
                     percent_of_cases = sum(.$percent_of_cases, na.rm = TRUE)))
  res
}
Example:
> mult_resp(df1, mv_q = c("q1a", "q1b", "q1c"))
# A tibble: 4 x 4
question freq percent percent_of_cases
<chr> <dbl> <dbl> <dbl>
1 q1a 15 33.3 60
2 q1b 15 33.3 60
3 q1c 15 33.3 60
4 Total 45 100. 180
It's an old question, but you can use the userfriendlyscience package to analyze multiple-response survey data very easily. (Note that userfriendlyscience has since been archived on CRAN, so it may need to be installed from an archive snapshot.)
library(userfriendlyscience)
multiResponse (data, c('v1', 'v2', 'v3'))