I have a data set like this:
a <- structure(list(Prone = c("M", "N", "N", "N", "M", "N", "M", "N", "M", "M"),
Type = c("A", "B", "C", "A", "A", "A", "B", "B", "C", "B"),
Alc = c("A", "B", "N", "A", "A", "A", "B", "B", "B", "B"),
Com = c("Y", "N", "Y", "Y", "Y", "Y", "Y", "N", "N", "Y")),
.Names = c("Prone", "Type", "Alc", "Com"), row.names = c(NA, -10L), class = "data.frame")
a
Prone Type Alc Com
1 M A A Y
2 N B B N
3 N C N Y
4 N A A Y
5 M A A Y
6 N A A Y
7 M B B Y
8 N B B N
9 M C B N
10 M B B Y
I'd like to get the frequency count of each unique row, like the following:
Prone Type Alc Com Freq
1 M A A Y 2
2 M B B Y 2
3 M C B N 1
4 N A A Y 2
5 N B B N 2
6 N C N Y 1
Thanks in advance.
Alternate plyr solution:
> library("plyr")
> count(a)
Prone Type Alc Com freq
1 M A A Y 2
2 M B B Y 2
3 M C B N 1
4 N A A Y 2
5 N B B N 2
6 N C N Y 1
The mandatory data.table solution:
library(data.table)
dt = data.table(a)
dt[, list(Freq = .N), by = names(dt)]
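If you prefer dplyr, a roughly equivalent sketch (assuming a dplyr version recent enough to have count()'s name argument) is:
library(dplyr)
# group by every column and count the rows; name = "Freq" renames the default n column
count(a, Prone, Type, Alc, Com, name = "Freq")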
There are a lot of ways to do this; here is a simple plyr example:
> library(plyr)
> ddply(a,names(a),summarize,Freq=length(Prone))
Prone Type Alc Com Freq
1 M A A Y 2
2 M B B Y 2
3 M C B N 1
4 N A A Y 2
5 N B B N 2
6 N C N Y 1
Using base aggregate:
aggregate(data = transform(a, Freq = seq_len(nrow(a))), Freq ~ ., length)
Prone Type Alc Com Freq
1 N B B N 2
2 M C B N 1
3 M A A Y 2
4 N A A Y 2
5 M B B Y 2
6 N C N Y 1
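For completeness, plain table() works here too, although it enumerates every combination of levels, so the zero-count rows have to be dropped (a small sketch; the row order may differ):
tab <- as.data.frame(table(a))
subset(tab, Freq > 0)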
Here's another approach:
library(qdap)
colsplit2df(data.frame(table(paste2(a))), new.names = names(a))
## Prone Type Alc Com Freq
## 1 M A A Y 2
## 2 M B B Y 2
## 3 M C B N 1
## 4 N A A Y 2
## 5 N B B N 2
## 6 N C N Y 1
So I have 9 columns
a b c d e f g h i
1 1 t p 1 h p 1 v g
2 2 e h 2 j m 2 c f
3 3 f g 3 k l 3 b d
and I want to know how I can make them like this:
a b c
1 1 t p
2 2 e h
3 3 f g
4 1 h p
5 2 j m
6 3 k l
7 1 v g
8 2 c f
9 3 b d
We can use reshape from base R, specifying the columns to combine together in a list of vectors:
out <- reshape(df1, direction = 'long',
varying = list(c('a', 'd', 'g'), c('b', 'e', 'h'),
c('c', 'f', 'i')))[c('a', 'b', 'c')]
row.names(out) <- NULL
out
# a b c
#1 1 t p
#2 2 e h
#3 3 f g
#4 1 h p
#5 2 j m
#6 3 k l
#7 1 v g
#8 2 c f
#9 3 b d
Or using melt from data.table
library(data.table)
melt(setDT(df1), measure = list(c('a', 'd', 'g'), c('b', 'e', 'h'),
c('c', 'f', 'i')), value.name = c('a', 'b', 'c'))[, variable := NULL][]
Data
df1 <- structure(list(a = 1:3, b = c("t", "e", "f"), c = c("p", "h",
"g"), d = 1:3, e = c("h", "j", "k"), f = c("p", "m", "l"), g = 1:3,
h = c("v", "c", "b"), i = c("g", "f", "d")),
class = "data.frame", row.names = c("1",
"2", "3"))
One option involving purrr could be:
library(purrr)

map_dfc(.x = split.default(df1, rep(1:3, length.out = length(df1))),
        ~ stack(.)[1]) %>%
  setNames(c("a", "b", "c"))
a b c
1 1 t p
2 2 e h
3 3 f g
4 1 h p
5 2 j m
6 3 k l
7 1 v g
8 2 c f
9 3 b d
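A minimal base-R sketch (no packages) that gets the same result by renaming each three-column block to a/b/c and stacking the blocks with rbind:
# each block of three columns becomes one chunk of the long result
blocks <- list(c("a", "b", "c"), c("d", "e", "f"), c("g", "h", "i"))
out <- do.call(rbind, lapply(blocks, function(cols) setNames(df1[cols], c("a", "b", "c"))))
rownames(out) <- NULL
out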
I have this df, which contains information on article collaborations:
author author2 author3 author4
1 A D E F
2 B G
3 C H F
I need to create an edges dataframe, which contains the relationship between the authors, like this:
from to
1 A D
2 A E
3 A F
4 B G
5 C H
6 C F
7 D E
8 D F
9 E F
10 H F
Any ideas how to do it?
We can gather each column against the remaining columns, i.e. those to its right, and then bind all the results together.
library(tidyverse)
map_dfr(names(df)[-length(df)], ~ select(df, .x:ncol(df)) %>%
          gather(k, to, -.x) %>%
          arrange(!!ensym(.x)) %>%
          select(-k) %>%
          filter(to != '') %>%
          rename(from = starts_with('author')))
  from to
1 A D
2 A E
3 A F
4 B G
5 C H
6 C F
7 D E
8 D F
9 H F
10 E F
Data
df <- structure(list(author = c("A", "B", "C"), author2 = c("D", "G",
"H"), author3 = c("E", "", "F"), author4 = c("F","", "")), class = "data.frame", row.names = c("1",
"2", "3"))
You could apply combn row-wise inside a function; no packages needed.
edges <- setNames(as.data.frame(do.call(rbind, lapply(seq(nrow(d)), function(x)
matrix(unlist(t(combn(na.omit(unlist(d[x, ])), 2))), ncol=2)))), c("from", "to"))
edges
# from to
# 1 A D
# 2 A E
# 3 A F
# 4 D E
# 5 D F
# 6 E F
# 7 B G
# 8 C H
# 9 C F
# 10 H F
Or, using the igraph package as @akrun suggested.
library(igraph)
edges <- do.call(rbind, apply(d, 1, function(x)
as_data_frame(graph_from_data_frame(t(combn(na.omit(x), 2))))))
edges
# from to
# 1 A D
# 2 A E
# 3 A F
# 4 D E
# 5 D F
# 6 E F
# 7 B G
# 8 C H
# 9 C F
# 10 H F
Data
d <- structure(list(author = c("A", "B", "C"), author2 = c("D", "G",
"H"), author3 = c("E", NA, "F"), author4 = c("F", NA, NA)), row.names = c(NA,
-3L), class = "data.frame")
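Either way, the resulting edge list plugs straight into igraph if you need the actual graph object, e.g.:
library(igraph)
# build an undirected collaboration graph from the edges computed above
g <- graph_from_data_frame(edges, directed = FALSE)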
I want to reassign multiple different character strings with the same value in a single call. However, the following code only replaces some of the values in each variable.
dat <-data.frame(x=c(rep("1=x",4),rep("b",4)),y=c(rep("1=z",4),rep("b",4)))
dat[] <- sapply(dat[], as.character)
dat[dat == c("1=x", "1=y")]<- 1
such that I get:
dat
x y
1 1 1=z
2 1=x 1=z
3 1 1=z
4 1=x 1=z
5 b b
6 b b
7 b b
8 b b
when what I want is the following:
dat
x y
1 1 1
2 1 1
3 1 1
4 1 1
5 b b
6 b b
7 b b
8 b b
With dplyr:
library(dplyr)
dat <- mutate_all(dat, funs(replace(., grepl("1=", .), 1)))
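Note that funs() is deprecated in current dplyr releases; assuming dplyr >= 1.0, a roughly equivalent sketch uses across():
# same replacement, written with across() instead of the deprecated funs()
dat <- mutate(dat, across(everything(), ~ replace(.x, grepl("1=", .x), 1)))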
With Base R:
dat[] <- lapply(dat, function(x) replace(x, grepl("1=", x), 1))
Result:
x y
1 1 1
2 1 1
3 1 1
4 1 1
5 b b
6 b b
7 b b
8 b b
Data:
dat <- structure(list(x = c("1=x", "1=x", "1=x", "1=x", "b", "b", "b",
"b"), y = c("1=z", "1=z", "1=z", "1=z", "b", "b", "b", "b")), .Names = c("x",
"y"), row.names = c(NA, -8L), class = "data.frame")
Another Base R option if you want to make an explicit replacement of certain strings would be:
dat[] <- lapply(dat,function(x) ifelse(x %in% c("1=x", "1=z"), 1, x))
Result:
x y
1 1 1
2 1 1
3 1 1
4 1 1
5 b b
6 b b
7 b b
8 b b
Data:
dat <- structure(list(x = c("1", "1", "1", "1", "b", "b", "b", "b"),
y = c("1", "1", "1", "1", "b", "b", "b", "b")), row.names = c(NA,
-8L), class = "data.frame")
A data frame contains ID, group, n (numeric), and several factor variables
ID <- c(1,2,3,4,5,6,7,8,9,10)
group <- c("m", "m", "m", "f", "f", "m", "m", "f", "f", "m")
n <- c(1,2,6,3,6,8,4,1,4,2)
b1 <- c("a", "b", "", "a", "d", "d", "a", "c", "c", "b")
b2 <- c("a", "", "e", "a", "d", "d", "a", "c", "c", "b")
b3 <- c("a", "b", "", "a", "", "d", "a", "c", "c", "b")
b4 <- c("a", "b", "e", "a", "", "d", "a", "c", "c", "b")
b5 <- c("a", "b", "e", "a", "d", "", "", "", "c", "b")
b6 <- c("a", "", "", "", "d", "d", "", "c", "c", "b")
df <- data.frame(ID, group, n, b1, b2, b3, b4, b5, b6)
I need to create a new character column (call it y).
The way to compute y is by joining the first n of the variables (b1, b2, b3, b4, b5, b6), separated by commas.
Note: if a value is blank, it is dropped from the join.
For example, for ID = 1, y = "a"; for ID = 2, y = "b" (not "b,"); for ID = 3, y = "e,e,e", etc.
And, the faster the code, the better.
A possible solution, though speed might still be an issue:
df$y <- sapply(seq_len(nrow(df)), function(i){
cvec <- head(unlist(df[i, 4:9]), df$n[i])
cvec <- cvec[!cvec == '']
paste(cvec, collapse = ',')
})
# ID group n b1 b2 b3 b4 b5 b6 y
# 1 1 m 1 a a a a a a a
# 2 2 m 2 b b b b b
# 3 3 m 6 e e e e,e,e
# 4 4 f 3 a a a a a a,a,a
# 5 5 f 6 d d d d d,d,d,d
# 6 6 m 8 d d d d d d,d,d,d,d
# 7 7 m 4 a a a a a,a,a,a
# 8 8 f 1 c c c c c c
# 9 9 f 4 c c c c c c c,c,c,c
# 10 10 m 2 b b b b b b b,b
Here is an option using gsub and paste. We paste the 'b' columns of 'df' (do.call(paste0, df[-(1:3)])), then use substring to keep only the number of characters indicated by the 'n' column, and finally use gsub to insert a , between each pair of characters.
df$y <- gsub("(?<=\\S)(?=\\S)", ",",
substring(do.call(paste0, df[-(1:3)]), 1, df$n), perl = TRUE)
df
# ID group n b1 b2 b3 b4 b5 b6 y
#1 1 m 1 a a a a a a a
#2 2 m 2 b b b b b,b
#3 3 m 6 e e e e,e,e
#4 4 f 3 a a a a a a,a,a
#5 5 f 6 d d d d d,d,d,d
#6 6 m 8 d d d d d d,d,d,d,d
#7 7 m 4 a a a a a,a,a,a
#8 8 f 1 c c c c c c
#9 9 f 4 c c c c c c c,c,c,c
#10 10 m 2 b b b b b b b,b
df$y <- apply(df, 1, function(r) {
gsub("\\s+", "\\,", trimws(paste(head(r[4:9], r["n"]), sep= " ", collapse = " ")))})
df
# ID group n b1 b2 b3 b4 b5 b6 y
# 1 1 m 1 a a a a a a a
# 2 2 m 2 b b b b b
# 3 3 m 6 e e e e,e,e
# 4 4 f 3 a a a a a a,a,a
# 5 5 f 6 d d d d d,d,d,d
# 6 6 m 8 d d d d d d,d,d,d,d
# 7 7 m 4 a a a a a,a,a,a
# 8 8 f 1 c c c c c c
# 9 9 f 4 c c c c c c c,c,c,c
# 10 10 m 2 b b b b b b b,b
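Since the question asks for speed, here is a sketch that pushes most of the work into matrix operations before a single apply() over the rows. It assumes the b columns are character (or coercible by as.matrix) and follows the same "first n columns, then drop blanks" reading as above:
bmat <- as.matrix(df[paste0("b", 1:6)])
# n has one entry per row, so recycling blanks out every cell past the first n columns
bmat[col(bmat) > df$n] <- ""
df$y <- apply(bmat, 1, function(v) paste(v[nzchar(v)], collapse = ","))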
I am trying to "merge" column V of a Data Frame in another one if the columns X and Y are equals (I have to match dOne.X == dTwo.X & dOne.Y == dTwo.Y and also dOne.X == dTwo.Y & dOne.Y == dTwo.X)
I solved this using a for loop, but it is slow when the Data Frame dOne is big (in my machine it takes 25 minutes if length(dOne.X) == 500000). I would like to know if there is a way to solve this problem using a faster "vectorized" operation. Above is an exemple of what I want to do:
Data Frame ONE
X Y V
a b 2
a c 3
a d 0
a e 0
b c 2
b d 3
b e 0
c d 2
c e 0
d e 0
Data Frame TWO
X Y V
a b 1
a c 1
a d 1
b c 1
b d 1
c d 1
e d 1
Expected Data Frame after the columns are merged
X Y V V2
a b 2 1
a c 3 1
a d 0 1
a e 0 0
b c 2 1
b d 3 1
b e 0 0
c d 2 1
c e 0 0
d e 0 1
This is the code I am using so far, which is slow when dOne is big (hundreds of thousands of rows):
copyadjlistValueColumn <- function(dOne, dTwo) {
dOne$V2 <- 0
lv <- union(levels(dOne$Y), levels(dOne$X))
dTwo$X <- factor(dTwo$X, levels = lv)
dTwo$Y <- factor(dTwo$Y, levels = lv)
dOne$X <- factor(dOne$X, levels = lv)
dOne$Y <- factor(dOne$Y, levels = lv)
for(i in 1:nrow(dTwo)) {
row <- dTwo[i,]
dOne$V2[dOne$X == row$X & dOne$Y == row$Y] <- row$V
dOne$V2[dOne$X == row$Y & dOne$Y == row$X] <- row$V
}
dOne
}
This is a testthat test case that covers what I am expecting (using the data frames above):
test_that("Copy V column to another Data Frame", {
dfOne <- data.frame(X=c("a", "a", "a", "a", "b", "b", "b", "c", "c", "d"),
Y=c("b", "c", "d", "e", "c", "d", "e", "d", "e", "e"),
V=c(2, 3, 0, 0, 2, 3, 0, 2, 0, 0))
dfTwo <- data.frame(X=c("a", "a", "a", "b", "b", "c", "e"),
Y=c("b", "c", "d", "c", "d", "d", "d"),
V=c(1, 1, 1, 1, 1, 1, 1))
lv <- union(levels(dfTwo$Y), levels(dfTwo$X))
dfExpected <- data.frame(X=c("a", "a", "a", "a", "b", "b", "b", "c", "c", "d"),
Y=c("b", "c", "d", "e", "c", "d", "e", "d", "e", "e"),
V=c(2, 3, 0, 0, 2, 3, 0, 2, 0, 0),
V2=c(1, 1, 1, 0, 1, 1, 0, 1, 0, 1))
dfExpected$X <- factor(dfExpected$X, levels = lv)
dfExpected$Y <- factor(dfExpected$Y, levels = lv)
dfMerged <- copyadjlistValueColumn(dfOne, dfTwo)
expect_identical(dfMerged, dfExpected)
})
Any suggestion?
Thanks a lot :)
Try two merges, where the order of the matching columns is reversed in the second, to get the 'bidirectional' matching. Then you can use e.g. rowSums to collapse the two created columns into one.
d1 <- merge(dfOne, dfTwo, by.x = c("X", "Y"), by.y = c("X", "Y"), all.x = TRUE)
d2 <- merge(d1, dfTwo, by.x = c("X", "Y"), by.y = c("Y", "X"), all.x = TRUE)
cbind(dfOne, V2 = rowSums(cbind(d2$V.y, d2$V), na.rm = TRUE))
# X Y V V2
# 1 a b 2 1
# 2 a c 3 1
# 3 a d 0 1
# 4 a e 0 0
# 5 b c 2 1
# 6 b d 3 1
# 7 b e 0 0
# 8 c d 2 1
# 9 c e 0 0
# 10 d e 0 1
For faster alternatives to merge, see the data.table and dplyr approaches here: stackoverflow.com/questions/1299871/how-to-join-data-frames-in-r-inner-outer-left-right/
Here's a possible data.table approach. It should be particularly efficient for a big data set like yours.
First, convert to data.table objects and set the keys:
library(data.table)
setkey(setDT(dfOne), X, Y)
setkey(setDT(dfTwo), X, Y)
Then perform a join on the X & Y combination: the join matches the key columns X, Y of dfOne with the key columns X, Y of dfTwo, respectively.
dfOne[dfTwo, V2 := i.V]
Now perform a join on the Y & X combination: the join matches the key columns X, Y of dfOne with the key columns Y, X of dfTwo, respectively.
setkey(dfTwo, Y, X)
dfOne[dfTwo, V2 := i.V][]
Result (I'll keep the unmatched rows as NA instead of zero, as it makes more sense this way):
# X Y V V2
# 1: a b 2 1
# 2: a c 3 1
# 3: a d 0 1
# 4: a e 0 NA
# 5: b c 2 1
# 6: b d 3 1
# 7: b e 0 NA
# 8: c d 2 1
# 9: c e 0 NA
# 10: d e 0 1
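As a side note, in more recent data.table versions the same two update joins can be written without setting keys at all, using on= (a sketch):
library(data.table)
setDT(dfOne); setDT(dfTwo)
# straight X/Y match
dfOne[dfTwo, on = c("X", "Y"), V2 := i.V]
# reversed match: dfOne's X against dfTwo's Y, and dfOne's Y against dfTwo's X
dfOne[dfTwo, on = c(X = "Y", Y = "X"), V2 := i.V][]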
With dplyr:
library(dplyr)
left_join(dfOne, dfTwo, by = c("X", "Y")) %>%
left_join(dfTwo, by = c("X" = "Y", "Y" = "X")) %>%
mutate(V2 = ifelse(is.na(V.y), V, V.y)) %>%
select(X, Y, V = V.x, V2) %>%
do(replace(., is.na(.), 0))
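A small variant of the same chain: dplyr::coalesce can replace the ifelse()/replace() pair and fill the unmatched rows with 0 in one step (a sketch, assuming the joins produce the V.x/V.y/V names as above):
library(dplyr)
left_join(dfOne, dfTwo, by = c("X", "Y")) %>%
  left_join(dfTwo, by = c("X" = "Y", "Y" = "X")) %>%
  mutate(V2 = coalesce(V.y, V, 0)) %>%
  select(X, Y, V = V.x, V2)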