swapping rows and columns in R - r

I have a table that looks like :
> head(test,10)
# A tibble: 10 x 16
Question_1 Question_2 Question_3 Question_4 Question_5 Question_6 Question_7 Question_8 Question_9
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 B C C E C A C E C
2 C C C B C A E D C
3 B C C E C A C E C
4 C C C D C A C D C
5 B B C B A A A D C
6 C C C E BLANK A C E C
7 C C C E C A E E C
8 B C C E C A C D C
9 C C C E C A C D C
10 D C E B C A A D C
and I want to transpose it so that I get one row per question and six separate columns holding the counts of A, B, C, D, E and BLANK.

We can gather into 'long' format, get the count of the 'key' and 'value' columns, and spread it to 'wide' format
library(tidyverse)

# Count how often each answer occurs for every question.
# pivot_longer()/pivot_wider() are used instead of the superseded
# gather()/spread().
test %>%
  # long format: one row per (question, answer) pair
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  count(key, value) %>%
  # wide format: one row per question, one column per distinct answer;
  # names_sort = TRUE orders the answer columns alphabetically and
  # values_fill = 0 reports answers that never occur for a question as 0
  pivot_wider(
    names_from = value,
    values_from = n,
    values_fill = 0,
    names_sort = TRUE
  )
Or using melt/dcast
library(data.table)

# Melt the question columns to long format (columns `variable` and `value`),
# then cast back to wide, counting the rows in each question x answer cell.
# NOTE: setDT() converts `test` to a data.table by reference.
dcast(
  # full argument name instead of relying on partial matching of `measure`
  melt(setDT(test), measure.vars = patterns("^Question")),
  variable ~ value,
  # state the aggregation explicitly rather than relying on dcast()'s
  # implicit default
  fun.aggregate = length
)
Or in base R with no looping, by replicating the column names of 'test' to match the unlisted values of 'test' and passing both to table
table(names(test)[col(test)], unlist(test))
# A B BLANK C D E
# Question_1 0 4 0 5 1 0
# Question_2 0 1 0 9 0 0
# Question_3 0 0 0 9 0 1
# Question_4 0 3 0 0 1 6
# Question_5 1 0 1 8 0 0
# Question_6 10 0 0 0 0 0
# Question_7 2 0 0 6 0 2
# Question_8 0 0 0 0 6 4
# Question_9 0 0 0 10 0 0
NOTE: There is no need to trick with a loop
Benchmarks
df2 <- test[rep(seq_len(nrow(test)), 1e5), ]
system.time({
vals <- unique(unlist(df2))
t(sapply(df2, function(x) table(factor(x, levels = vals))))
})
# user system elapsed
# 6.987 0.367 7.293
system.time({
table(names(df2)[col(df2)], unlist(df2))
})
# user system elapsed
# 6.355 0.407 6.720
system.time({
gather(df2) %>%
count(key, value) %>%
spread(value, n, fill = 0)
})
# user system elapsed
# 0.567 0.125 0.695
system.time({
dcast(melt(setDT(df2), measure = patterns("^Question")), variable ~ value)
})
# user system elapsed
# 0.789 0.018 0.195
data
# Example data: 10 rows (row names "1".."10") by 9 question columns, each
# holding the answer given as a string ("A".."E" or "BLANK").
test <- structure(
  list(
    Question_1 = c("B", "C", "B", "C", "B", "C", "C", "B", "C", "D"),
    Question_2 = c("C", "C", "C", "C", "B", "C", "C", "C", "C", "C"),
    Question_3 = c(rep("C", 9), "E"),
    Question_4 = c("E", "B", "E", "D", "B", "E", "E", "E", "E", "B"),
    Question_5 = c("C", "C", "C", "C", "A", "BLANK", "C", "C", "C", "C"),
    Question_6 = rep("A", 10),
    Question_7 = c("C", "E", "C", "C", "A", "C", "E", "C", "C", "A"),
    Question_8 = c("E", "D", "E", "D", "D", "E", "E", "D", "D", "D"),
    Question_9 = rep("C", 10)
  ),
  class = "data.frame",
  row.names = as.character(1:10)
)

A base R trick could be to get all the unique values of the dataframe and use sapply and count frequency of each value in the column.
# Every distinct answer found anywhere in the data frame; used as a common
# set of factor levels so that each question is tabulated against the same
# categories (answers a question never received are counted as 0).
vals <- unique(unlist(test))

# Tabulate each column against the shared levels, then transpose so that
# the questions end up in rows and the answers in columns.
t(
  sapply(
    test,
    function(answers) {
      table(factor(answers, levels = vals))
    }
  )
)
# B C D E A BLANK
#Question_1 4 5 1 0 0 0
#Question_2 1 9 0 0 0 0
#Question_3 0 9 0 1 0 0
#Question_4 3 0 1 6 0 0
#Question_5 0 8 0 0 1 1
#Question_6 0 0 0 0 10 0
#Question_7 0 6 0 2 2 0
#Question_8 0 0 6 4 0 0
#Question_9 0 10 0 0 0 0

Related

How to count a swap characters between two columns in R

I have a data frame that looks like this
df <- data.frame(col1 = c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B",
"C", "C", "C", "C", "C"),
col2 = c("A", "B", "C", "D", "E", "A", "B", "C", "D", "E",
"A", "B", "C", "D", "E"))
what I want is to have like this
df <- data.frame(col1 = c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B",
"C", "C", "C", "C", "C"),
col2 = c("A", "B", "C", "D", "E", "A", "B", "C", "D", "E",
"A", "B", "C", "D", "E"),
col3 = c("1","0","0","0","0","1","1","0","0","0","1","1","1","0","0"))
In col3, it counts the duplicated characters as 1 and unique as 0. row 6 is considered a duplicate because the swap characters ("B", "A") were counted already in row2 as unique ("A", "B"). I can easily do this in excel using the if and countif function. Thanks in advance!
We can use pmin and pmax to sort the values from left to right by rows and apply duplicated to check the duplicates
# Add col3: 1 when both values in a row are equal or when the same pair of
# values (in either order) already appeared in an earlier row, otherwise 0.
# pmin()/pmax() put each pair into a fixed order so that ("B", "A") and
# ("A", "B") produce the same pasted key for duplicated().
transform(
  df,
  col3 = as.integer(
    col1 == col2 |
      duplicated(paste(pmin(col1, col2), pmax(col1, col2)))
  )
)
which gives
col1 col2 col3
1 A A 1
2 A B 0
3 A C 0
4 A D 0
5 A E 0
6 B A 1
7 B B 1
8 B C 0
9 B D 0
10 B E 0
11 C A 1
12 C B 1
13 C C 1
14 C D 0
15 C E 0
Does this work:
# Flag rows (col3 = 1) whose two values are equal or whose pair of values
# was already seen, in either order, in an earlier row; other rows get 0.
# Requires dplyr (mutate, select, %>%) and stringr (str_c) to be attached.
df %>%
  mutate(
    # Order-independent key: the characters of the concatenated pair, sorted.
    # vapply() yields a plain character column (one string per row) instead
    # of the list column that lapply() would create.
    col5 = vapply(
      strsplit(str_c(col1, col2), ""),
      function(chars) paste(sort(chars), collapse = ""),
      character(1)
    ),
    # unary + turns the logical result into 0/1
    col3 = +(duplicated(col5) | col1 == col2)
  ) %>%
  select(col1, col2, col3)
col1 col2 col3
1 A A 1
2 A B 0
3 A C 0
4 A D 0
5 A E 0
6 B A 1
7 B B 1
8 B C 0
9 B D 0
10 B E 0
11 C A 1
12 C B 1
13 C C 1
14 C D 0
15 C E 0
Here is one option where we look for any duplicates or where col1 and col2 are the same. The + returns a binary for the logical.
# col3 is 1 when the row's (col1, col2) pair already occurred in either
# order in an earlier row, or when both values are equal; otherwise 0.
# Only col1 and col2 are handed to apply(): with the whole data frame, any
# additional column (for example an already existing col3) would become
# part of the sorted key and change which rows count as duplicates.
df$col3 <- +(
  duplicated(t(apply(df[c("col1", "col2")], 1, sort))) |
    df$col1 == df$col2
)
Output
col1 col2 col3
1 A A 1
2 A B 0
3 A C 0
4 A D 0
5 A E 0
6 B A 1
7 B B 1
8 B C 0
9 B D 0
10 B E 0
11 C A 1
12 C B 1
13 C C 1
14 C D 0
15 C E 0
try this
column <- grepl("^[.0-9]+$", dat[,1])
column
dat2 <- data.frame(Sex = dat[cbind(seq_len(nrow(dat)),1+column)], Length =
dat[cbind(seq_len(nrow(dat)),2-column)])
dat2$Length <- as.numeric(dat2$Length)
dat2

How to generate string counts in different samples by R

Let's say I have a data table as follow:
ID1 ID2 ID3
-------------
a a b
a b b
b b b
c c c
c c d
c d d
d e
d e
e
Then I want to convert it as like following structure:
Samples ID1 ID2 ID3
-------------------
a 2 1 0
b 1 2 3
c 3 2 1
d 2 1 2
e 1 0 2
Would any of you please help me with R or bash code to achieve such transformation?
Try the R code below
> table(stack(df))
ind
values ID1 ID2 ID3
a 2 1 0
b 1 2 3
c 3 2 1
d 2 1 2
e 1 0 2
data
> dput(df)
structure(list(ID1 = c("a", "a", "b", "c", "c", "c", "d", "d",
"e"), ID2 = c("a", "b", "b", "c", "c", "d", NA, NA, NA), ID3 = c("b",
"b", "b", "c", "d", "d", "e", "e", NA)), class = "data.frame", row.names = c(NA,
-9L))
An option with tidyverse - reshape to 'long' format with pivot_longer, get the count and reshape back to 'wide' format with pivot_wider
library(dplyr)
library(tidyr)
df %>%
pivot_longer(everything(), values_drop_na = TRUE, values_to = 'Samples') %>%
count(name, Samples) %>%
pivot_wider(names_from = name, values_from = n, values_fill = 0)
-output
# A tibble: 5 × 4
Samples ID1 ID2 ID3
<chr> <int> <int> <int>
1 a 2 1 0
2 b 1 2 3
3 c 3 2 1
4 d 2 1 2
5 e 1 0 2
data
df <- structure(list(ID1 = c("a", "a", "b", "c", "c", "c", "d", "d",
"e"), ID2 = c("a", "b", "b", "c", "c", "d", NA, NA, NA), ID3 = c("b",
"b", "b", "c", "d", "d", "e", "e", NA)), class = "data.frame",
row.names = c(NA,
-9L))

how to add a column to identify specific combination of values in R?

I have a database with several columns (>20), and 2 of these columns contain the subject names. I would like to add another column containing a number that identifies the combination of the two subjects.
Here is an example with only the 2 columns of names (I don't include the others for convenience):
ID1 ID2
A B
A C
A B
B C
A B
B A
C B
And here is what i would like to create:
ID1 ID2 CODE
A B 1
A C 2
A B 1
B C 3
A B 1
B A 1
C B 3
I am kind of new in R and I think it can be done with stringr but I am not sure how
Thanks for the help!
Simo
# Assign an integer code to every unordered (ID1, ID2) combination, so that
# ("A", "B") and ("B", "A") receive the same code.
# Only the two ID columns are used to build the key; passing the whole data
# frame would mix every other column of a wider table into the key.
# The "\t" separator keeps distinct pairs apart when names have more than
# one character (e.g. "AB" + "C" versus "A" + "BC").
df$CODE <- as.integer(
  factor(
    apply(
      df[c("ID1", "ID2")],
      1,
      function(x) paste0(sort(x), collapse = "\t")
    )
  )
)
# ID1 ID2 CODE
# 1 A B 1
# 2 A C 2
# 3 A B 1
# 4 B C 3
# 5 A B 1
# 6 B A 1
# 7 C B 3
Data
df <- data.frame(
ID1 = c("A", "A", "A", "B", "A", "B", "C"),
ID2 = c("B", "C", "B", "C", "B", "A", "B")
)
Try this:
library(dplyr)
#Code
new <- df %>% rowwise() %>%
mutate(Var = paste0(sort(c(ID1, ID2)), collapse = '')) %>%
group_by(Var) %>%
mutate(CODE=cur_group_id()) %>%
ungroup() %>%
select(-Var)
Output:
# A tibble: 7 x 3
ID1 ID2 CODE
<chr> <chr> <int>
1 A B 1
2 A C 2
3 A B 1
4 B C 3
5 A B 1
6 B A 1
7 C B 3
Some data used:
#Data
df <- structure(list(ID1 = c("A", "A", "A", "B", "A", "B", "C"), ID2 = c("B",
"C", "B", "C", "B", "A", "B")), class = "data.frame", row.names = c(NA,
-7L))

Matching rows to columns and counting same occurences R

I have a dataset which is of the following form:-
a <- data.frame(X1=c("A", "B", "C", "A", "B", "C"),
X2=c("B", "C", "C", "A", "A", "B"),
X3=c("B", "E", "A", "A", "A", "B"),
X4=c("E", "C", "A", "A", "A", "C"),
X5=c("A", "C", "C", "A", "B", "B")
)
And I have another set of the following form:-
# Second example data set; `col_4` holds the values that are looked up in
# each row of `a`.
b <- data.frame(
  col_1 = c("ASD", "ASD", "BSD", "BSD"),
  col_2 = c(1, 1, 1, 1),
  col_3 = c(12, 12, 31, 21),
  # c() was missing here, which made the whole call a syntax error
  col_4 = c("A", "B", "B", "A")
)
What I want to do is to take the column col_4 from set b and match it row-wise against set a, so that a new column tells me how many elements of col_4 each row contains. The name of the new column does not matter.
For ex:- The first and fifth row in set a has all the elements of col_4 from set b.
Also, duplicates shouldn't be found. For ex. sixth row in set a has 3 "B"s. But since col_4 from set b has only two "B"s, it should tell me 2 and not 3.
Expected output is of the form:-
c <- data.frame(X1=c("A", "B", "C", "A", "B", "C"),
X2=c("B", "C", "C", "A", "A", "B"),
X3=c("B", "E", "A", "A", "A", "B"),
X4=c("E", "C", "A", "A", "A", "C"),
X5=c("A", "C", "C", "A", "B", "B"),
found=c(4, 1, 2, 2, 4, 2)
)
We can use vecsets::vintersect which takes care of duplicates.
Using apply row-wise we can count how many common values there are between b$col_4 and each row in a.
apply(a, 1, function(x) length(vecsets::vintersect(b$col_4, x)))
#[1] 4 1 2 2 4 2
An option using data.table:
library(data.table)
#convert a into a long format
m <- melt(setDT(a)[, rn:=.I], id.vars="rn", value.name="col_4")
#order by row number and create an index for identical occurrences in col_4
setorder(m, rn, col_4)[, vidx := rowid(col_4), rn]
#create a similar index for b
setDT(b, key="col_4")[, vidx := rowid(col_4)]
#count occurrences and lookup this count into original data
a[b[m, on=.(col_4, vidx), nomatch=0L][, .N, rn], on=.(rn), found := N]
output:
X1 X2 X3 X4 X5 rn found
1: A B B E A 1 4
2: B C E C C 2 1
3: C C A A C 3 2
4: A A A A A 4 2
5: B A A A B 5 4
6: C B B C B 6 2
Another idea to operate on sets efficiently is to count and compare the element occurrences of b$col_4 in each row of a:
b1 = c(table(b$col_4))
#b1
#A B
#2 2
a1 = table(factor(as.matrix(a), names(b1)), row(a))
#a1
#
# 1 2 3 4 5 6
# A 2 0 2 5 3 0
# B 2 1 0 0 2 3
Finally, identify the least number of occurrences per element (for each row) and sum:
colSums(pmin(a1, b1))
#1 2 3 4 5 6
#4 1 2 2 4 2
In case of a larger "data.frame" with more elements, Matrix::sparseMatrix offers an appropriate alternative:
library(Matrix)
a.fac = factor(as.matrix(a), names(b1))
.i = as.integer(a.fac)
.j = c(row(a))
noNA = !is.na(.i) ## need to remove NAs manually
.i = .i[noNA]
.j = .j[noNA]
a1 = sparseMatrix(i = .i, j = .j, x = 1L, dimnames = list(names(b1), 1:nrow(a)))
a1
#2 x 6 sparse Matrix of class "dgCMatrix"
# 1 2 3 4 5 6
#A 2 . 2 5 3 .
#B 2 1 . . 2 3
colSums(pmin(a1, b1))
#1 2 3 4 5 6
#4 1 2 2 4 2

Frequency count of 5 rankings in R

Say I have 5 items A, B, C, D, E in a questionnaire and got respondents to rank them. The data looks like this,
> df
rank1 rank2 rank3 rank4 rank5
1 A B C D E
2 A C B D E
3 C A B E D
4 B A C D E
5 A B D C E
How do I count the frequency of each rank by item so the output looks like this,
item rank1 rank2 rank3 rank4 rank5
1 A 3 2 0 0 0
2 B 1 2 2 0 0
3 C 1 1 2 1 0
4 D 0 0 1 3 1
5 E 0 0 0 1 4
We can use table after converting to factor using base R
lvls <- sort(unique(unlist(df)))
sapply(df, function(x) table(factor(x, levels =lvls)))
# rank1 rank2 rank3 rank4 rank5
#A 3 2 0 0 0
#B 1 2 2 0 0
#C 1 1 2 1 0
#D 0 0 1 3 1
#E 0 0 0 1 4
Or calling table only once
table(unlist(df), c(col(df)))
# 1 2 3 4 5
# A 3 2 0 0 0
# B 1 2 2 0 0
# C 1 1 2 1 0
# D 0 0 1 3 1
# E 0 0 0 1 4
Or compactly with mtabulate from qdapTools
library(qdapTools)
t(mtabulate(df))
data
# Example data: 5 rows (row names "1".."5"); columns rank1..rank5 hold the
# item ("A".."E") placed at that rank in each row.
df <- structure(
  list(
    rank1 = c("A", "A", "C", "B", "A"),
    rank2 = c("B", "C", "A", "A", "B"),
    rank3 = c("C", "B", "B", "C", "D"),
    rank4 = c("D", "D", "E", "D", "C"),
    rank5 = c("E", "E", "D", "E", "E")
  ),
  class = "data.frame",
  row.names = as.character(1:5)
)
A tidy approach
Here is a way to solve your problem with functions from the tidyverse
library(tidyr)
library(dplyr)
your_data <- tribble(~"rank1", ~"rank2", ~"rank3", ~"rank4", ~"rank5",
"A", "B", "C", "D", "E",
"A", "C", "B", "D", "E",
"C", "A", "B", "E", "D",
"B", "A", "C", "D", "E",
"A", "B", "D", "C", "E")
your_data %>%
gather(key = rank_number, value = rank) %>%
count(rank_number, rank) %>%
spread(key = rank_number, value = n, fill = 0)
#> # A tibble: 5 x 6
#> rank rank1 rank2 rank3 rank4 rank5
#> * <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 3. 2. 0. 0. 0.
#> 2 B 1. 2. 2. 0. 0.
#> 3 C 1. 1. 2. 1. 0.
#> 4 D 0. 0. 1. 3. 1.
#> 5 E 0. 0. 0. 1. 4.

Resources