I have this example data frame, df.Journal.Conferences:
venue  author0   author1  author2  ...  author19
A      John      Mary
B      Peter     Jacob    Isabella
C      Lia
B      Jacob     Lara     John
C      Mary
B      Isabella
I want to know how many unique authors there are in each venue. Desired result:
A 2
B 5
C 2
Edit:
Here is the link to my data: GoogleDrive Excel sheet.
Because your data was hard to reproduce, I generated a "similar" data set; this should work:
set.seed(1984)
df <- data.frame(id = sample(1:5, 10, replace = TRUE),
                 v1 = sample(letters[1:5], 10, replace = TRUE),
                 v2 = sample(letters[1:5], 10, replace = TRUE),
                 v3 = sample(letters[1:5], 10, replace = TRUE),
                 v4 = sample(letters[1:5], 10, replace = TRUE),
                 stringsAsFactors = FALSE)

z <- data.frame(id = unique(df$id), n = NA)
for (i in z$id) {
  z$n[z$id == i] <- length(unique(unlist(df[df$id == i, -1])))
}
z
# id n
# 1 4 4
# 2 3 4
# 3 2 4
# 4 5 4
# 5 1 3
Using @zx8754's data for testing, this code gives what you wanted (assuming you have NA for empty cells in the data frame). split() breaks the author columns up by venue, and x[!is.na(x)] flattens each chunk into a vector of non-missing names:
sapply(split(df1[,-1], df1$venue), function(x) length(unique(x[!is.na(x)])))
# A B C
# 2 5 2
Using dplyr and tidyr: reshape the data from wide to long, then group by venue and count the distinct names.
library(dplyr)
library(tidyr)
gather(df1, key = author, value = name, -venue) %>%
  select(venue, name) %>%
  group_by(venue) %>%
  summarise(n = n_distinct(name, na.rm = TRUE))
# # A tibble: 3 × 2
# venue n
# <chr> <int>
# 1 A 2
# 2 B 5
# 3 C 2
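Note: on tidyr 1.0 and later, gather() is superseded by pivot_longer(); a sketch of the equivalent pipeline:
pivot_longer(df1, cols = -venue, names_to = "author", values_to = "name") %>%
  group_by(venue) %>%
  summarise(n = n_distinct(name, na.rm = TRUE))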
data
df1 <- read.table(text ="
venue,author0,author1,author2
A,John,Mary,NA
B,Peter,Jacob,Isabella
C,Lia,NA,NA
B,Jacob,Lara,John
C,Mary,NA,NA
B,Isabella,NA,NA
", header = TRUE, sep = ",", stringsAsFactors = FALSE)
Edit: I saved your Excel sheet as CSV, read it in using read.csv, and the above code then returns the output below:
df1 <- read.csv("Journal_Conferences_Authors.csv", na.strings = "#N/A")
# output
# # A tibble: 427 × 2
# venue n
# <fctr> <int>
# 1 AAAI 4
# 2 ACC 4
# 3 ACIS-ICIS 5
# 4 ACM SIGSOFT Software Engineering Notes 1
# 5 ACM Southeast Regional Conference 5
# 6 ACM TIST 3
# 7 ACM Trans. Comput.-Hum. Interact. 3
# 8 ACML 2
# 9 ADMA 2
# 10 Advanced Visual Interfaces 3
# # ... with 417 more rows
I have several files containing a variable that is named differently in each file but always contains the string "type_category", e.g. type_category_lifestyle_characterstics, type_category_uniqueness, etc. The idea is to go through these files and rename such variables to type_category. Below are example data frames:
df1 <- data.frame(id = c(1,2,3), type_category_lifestyle_characterstics = c(5,6,7), rating = c(1,3,4))
df2 <- data.frame(id = c(9,5,3), type_category_uniqueness = c(4,6,1), rating = c(2,7,4))
Thanks in advance
We can get the datasets into a list and rename the matching column in each:
library(dplyr)
library(purrr)
out <- map(mget(ls(pattern = '^df\\d+$')), ~ .x %>%
             rename_with(~ "type_category",
                         starts_with("type_category")))
Output:
out
$df1
id type_category rating
1 1 5 1
2 2 6 3
3 3 7 4
$df2
id type_category rating
1 9 4 2
2 5 6 7
3 3 1 4
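If you then want the renamed versions back as separate objects rather than a list (an extra step, not part of the original answer), you can write the list back to the global environment:
list2env(out, envir = .GlobalEnv)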
We could use setNames with lapply. Note that this replaces all the names positionally, so it assumes every data frame has its columns in the same order:
my_list <- list(df1, df2)
new_names <- c("id", "type_category", "rating")
lapply(my_list, setNames, new_names)
Output:
[[1]]
id type_category rating
1 1 5 1
2 2 6 3
3 3 7 4
[[2]]
id type_category rating
1 9 4 2
2 5 6 7
3 3 1 4
Base R
Once you have them in a list, you can use lapply to change the variable names in all of them:
df1 <- data.frame(id = c(1,2,3), type_category_lifestyle_characterstics = c(5,6,7), rating = c(1,3,4))
df2 <- data.frame(id = c(9,5,3), type_category_uniqueness = c(4,6,1), rating = c(2,7,4))
lapply(list(df1, df2),
       function(df) {
         nms <- names(df)
         nms[grepl(pattern = "type_category",
                   x = nms,
                   ignore.case = TRUE)] <- "type_category"
         names(df) <- nms
         return(df)
       })
#> [[1]]
#> id type_category rating
#> 1 1 5 1
#> 2 2 6 3
#> 3 3 7 4
#>
#> [[2]]
#> id type_category rating
#> 1 9 4 2
#> 2 5 6 7
#> 3 3 1 4
Just note that you would need to assign the result back to a list.
data.table
Since you tagged data.table: it allows you to change the names in place, so no extra assignment is necessary.
library(data.table)
dt1 <- data.table::data.table(id = c(1,2,3), type_category_lifestyle_characterstics = c(5,6,7), rating = c(1,3,4))
dt2 <- data.table::data.table(id = c(9,5,3), type_category_uniqueness = c(4,6,1), rating = c(2,7,4))
invisible(
  lapply(list(dt1, dt2),
         function(dt) {
           # copy the name vector: names() returns a reference for data.tables
           nms_old <- data.table::copy(names(dt))
           nms_new <- data.table::copy(nms_old)
           nms_new[grepl(pattern = "type_category",
                         x = nms_old,
                         ignore.case = TRUE)] <- "type_category"
           data.table::setnames(dt, old = nms_old, new = nms_new)
           return(NULL)
         })
)
dt1
#> id type_category rating
#> 1: 1 5 1
#> 2: 2 6 3
#> 3: 3 7 4
dt2
#> id type_category rating
#> 1: 9 4 2
#> 2: 5 6 7
#> 3: 3 1 4
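A shorter equivalent (my suggestion, same in-place result): setnames() also accepts just the names you want to change, so you can skip building the full old/new vectors:
library(data.table)
for (dt in list(dt1, dt2)) {
  setnames(dt,
           old = grep("type_category", names(dt), ignore.case = TRUE, value = TRUE),
           new = "type_category")
}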
I have 2 data frames in R (df1, df2).
df1:
A C D
1 1 1
2 2 2
df2:
A B C
1 1 1
2 2 2
How can I merge these 2 dataframes to produce the following output?
A B C D
2 1 2 1
4 2 4 2
Columns are sorted by name, and values in columns common to both data frames are added. Both data frames have the same number of rows. Thank you in advance.
Code to create DF:
df1 <- data.frame("A" = 1:2, "C" = 1:2, "D" = 1:2)
df2 <- data.frame("A" = 1:2, "B" = 1:2, "C" = 1:2)
nm1 = names(df1)
nm2 = names(df2)
nm = intersect(nm1, nm2)
if (length(nm) == 0){ # if no column names in common
cbind(df1, df2)
} else { # if column names in common
cbind(df1[!nm1 %in% nm2], # columns only in df1
df1[nm] + df2[nm], # add columns common to both
df2[!nm2 %in% nm1]) # columns only in df2
}
# D A C B
#1 1 2 2 1
#2 2 4 4 2
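If you also want the columns sorted alphabetically, as in the desired output, reorder at the end (a small add-on to the answer above):
res <- cbind(df1[!nm1 %in% nm2],
             df1[nm] + df2[nm],
             df2[!nm2 %in% nm1])
res[sort(names(res))]
#   A B C D
# 1 2 1 2 1
# 2 4 2 4 2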
You can try:
library(tidyverse)
list(df2, df1) %>%
  map(rownames_to_column) %>%
  bind_rows %>%
  group_by(rowname) %>%
  summarise_all(sum, na.rm = TRUE)
# A tibble: 2 x 5
rowname A B C D
<chr> <int> <int> <int> <int>
1 1 2 1 2 1
2 2 4 2 4 2
By using left_join() from dplyr you won't lose the columns that appear in only one data frame:
library(tidyverse)
dat1 <- tibble(a = 1:10,
b = 1:10,
c = 1:10)
dat2 <- tibble(c = 1:10,
d = 1:10,
e = 1:10)
left_join(dat1, dat2, by = "c")
#> # A tibble: 10 x 5
#> a b c d e
#> <int> <int> <int> <int> <int>
#> 1 1 1 1 1 1
#> 2 2 2 2 2 2
#> 3 3 3 3 3 3
#> 4 4 4 4 4 4
#> 5 5 5 5 5 5
#> 6 6 6 6 6 6
#> 7 7 7 7 7 7
#> 8 8 8 8 8 8
#> 9 9 9 9 9 9
#> 10 10 10 10 10 10
Created on 2019-01-16 by the reprex package (v0.2.1)
allnames <- sort(unique(c(names(df1), names(df2))))  # all column names, sorted
df3 <- data.frame(matrix(0, nrow = nrow(df1), ncol = length(allnames)))
names(df3) <- allnames
# add each data frame into the matching columns (this matches by position,
# so it assumes each data frame's own columns are already in sorted order)
df3[, allnames %in% names(df1)] <- df3[, allnames %in% names(df1)] + df1
df3[, allnames %in% names(df2)] <- df3[, allnames %in% names(df2)] + df2
df3
A B C D
1 2 1 2 1
2 4 2 4 2
Here is a fun base R method with Reduce.
Reduce(cbind,
list(Reduce("+", list(df1[intersect(names(df1), names(df2))],
df2[intersect(names(df1), names(df2))])), # sum results
df1[setdiff(names(df1), names(df2))], # in df1, not df2
df2[setdiff(names(df2), names(df1))])) # in df2, not df1
This returns
A C D B
1 2 2 1 1
2 4 4 2 2
This assumes that both df1 and df2 have columns that are not present in the other. If this is not true, you'd have to adjust the list.
Note also that you could replace Reduce with do.call in both places and you'd get the same result.
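Spelling out that note, the do.call version reads (same output):
do.call(cbind,
        list(do.call("+", list(df1[intersect(names(df1), names(df2))],
                               df2[intersect(names(df1), names(df2))])),
             df1[setdiff(names(df1), names(df2))],
             df2[setdiff(names(df2), names(df1))]))
#   A C D B
# 1 2 2 1 1
# 2 4 4 2 2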
I have a df like this:
name <- c("Fred","Mark","Jen","Simon","Ed")
a_or_b <- c("a","a","b","a","b")
abc_ah_one <- c(3,5,2,4,7)
abc_bh_one <- c(5,4,1,9,8)
abc_ah_two <- c(2,1,3,7,6)
abc_bh_two <- c(3,6,8,8,5)
abc_ah_three <- c(5,4,7,6,2)
abc_bh_three <- c(9,7,2,1,4)
def_ah_one <- c(1,3,9,2,7)
def_bh_one <- c(2,8,4,6,1)
def_ah_two <- c(4,7,3,2,5)
def_bh_two <- c(5,2,9,8,3)
def_ah_three <- c(8,5,3,5,2)
def_bh_three <- c(2,7,4,3,0)
df <- data.frame(name,a_or_b,abc_ah_one,abc_bh_one,abc_ah_two,abc_bh_two,
abc_ah_three,abc_bh_three,def_ah_one,def_bh_one,
def_ah_two,def_bh_two,def_ah_three,def_bh_three)
I want to use the value in column "a_or_b" to choose the values in each of the corresponding "ah/bh" columns for each "abc" (one, two, and three), and put it into a new data frame. For example, Fred would have the values 3, 2 and 5 in his row in the new df. Those values represent the values of each of his "ah" categories for the abc columns. Jen, who has "b" in her a_or_b column, would have all of her "bh" values from her abc columns for her row in the new df. Here is what my desired output would look like:
combo_one <- c(3,5,1,4,8)
combo_two <- c(2,1,8,7,5)
combo_three <- c(5,4,2,6,4)
df2 <- data.frame(name,a_or_b,combo_one,combo_two,combo_three)
I've attempted this using sapply. The following gives me a matrix of the correct column indexes of df[grep("abc", colnames(df), fixed = TRUE)] for each row:
sapply(paste0(df$a_or_b, "h"), grep, colnames(df[grep("abc", colnames(df), fixed = TRUE)]))
First we gather your data into a tidy long format, then break the key column out into something useful. After that the filtering is simple, and if necessary we can convert back to a wide format:
library(dplyr)
library(tidyr)
gather(df, key = "var", value = "val", -name, -a_or_b) %>%
  separate(var, into = c("combo", "h", "ind"), sep = "_") %>%
  mutate(h = substr(h, 1, 1)) %>%
  filter(a_or_b == h, combo == "abc") %>%
  arrange(name) -> result_long
result_long
# name a_or_b combo h ind val
# 1 Ed b abc b one 8
# 2 Ed b abc b two 5
# 3 Ed b abc b three 4
# 4 Fred a abc a one 3
# 5 Fred a abc a two 2
# 6 Fred a abc a three 5
# 7 Jen b abc b one 1
# 8 Jen b abc b two 8
# 9 Jen b abc b three 2
# 10 Mark a abc a one 5
# 11 Mark a abc a two 1
# 12 Mark a abc a three 4
# 13 Simon a abc a one 4
# 14 Simon a abc a two 7
# 15 Simon a abc a three 6
spread(result_long, key = ind, value = val) %>%
  select(name, a_or_b, one, two, three)
# name a_or_b one two three
# 1 Ed b 8 5 4
# 2 Fred a 3 2 5
# 3 Jen b 1 8 2
# 4 Mark a 5 1 4
# 5 Simon a 4 7 6
A base R approach would be to use lapply: we loop through each row of the data frame, build a pattern with paste0 based on the a_or_b column to find the matching columns, and then rbind all the selected values together.
new_df <- do.call("rbind", lapply(seq_len(nrow(df)), function(x)
  setNames(df[x, grepl(paste0("abc_", df[x, "a_or_b"], "h"), colnames(df))],
           c("combo_one", "combo_two", "combo_three"))))
new_df
# combo_one combo_two combo_three
#1 3 2 5
#2 5 1 4
#3 1 8 2
#4 4 7 6
#5 8 5 4
We can then cbind the required columns:
cbind(df[c(1, 2)], new_df)
# name a_or_b combo_one combo_two combo_three
#1 Fred a 3 2 5
#2 Mark a 5 1 4
#3 Jen b 1 8 2
#4 Simon a 4 7 6
#5 Ed b 8 5 4
It's possible to do this inside a single mutate() by building the matching column name for each row and looking it up with get() under rowwise():
library(tidyverse)
df %>%
  select(name, a_or_b, starts_with("abc")) %>%
  rename_with(~ sub("abc_", "", .x)) %>%
  rowwise() %>%
  mutate(combo_one   = get(paste0(a_or_b, "h_one")),
         combo_two   = get(paste0(a_or_b, "h_two")),
         combo_three = get(paste0(a_or_b, "h_three"))) %>%
  ungroup() %>%
  select(name, a_or_b, starts_with("combo")) %>%
  as.data.frame()
Output:
name a_or_b combo_one combo_two combo_three
1 Fred a 3 2 5
2 Mark a 5 1 4
3 Jen b 1 8 2
4 Simon a 4 7 6
5 Ed b 8 5 4
This question already has answers here: Repeat each row of data.frame the number of times specified in a column.
I am having trouble repeating rows of my real data using dplyr. There is already another post here, repeat-rows-of-a-data-frame, but it has no solution for dplyr.
Here is what I tried:
library(dplyr)
df <- data.frame(column = letters[1:4])
df_rep <- df %>%
  mutate(column = rep(column, each = 4))
but it failed with this error:
Error: wrong result size (16), expected 4 or 1
Expected output
>df_rep
column
#a
#a
#a
#a
#b
#b
#b
#b
#*
#*
#*
Using the uncount function will solve this problem as well. The count column indicates how often each row should be repeated:
library(tidyverse)
df <- tibble(letters = letters[1:4])
df
# A tibble: 4 x 1
letters
<chr>
1 a
2 b
3 c
4 d
df %>%
  mutate(count = c(2, 3, 2, 4)) %>%
  uncount(count)
# A tibble: 11 x 1
letters
<chr>
1 a
2 a
3 b
4 b
5 b
6 c
7 c
8 d
9 d
10 d
11 d
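uncount() can also record which copy each row is via its .id argument, e.g.:
df %>%
  mutate(count = c(2, 3, 2, 4)) %>%
  uncount(count, .id = "copy")
This adds an integer copy column that counts up within each original row.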
I was looking for a similar (but slightly different) solution. Posting here in case it's useful to anyone else.
In my case, I needed a more general solution that allows each letter to be repeated an arbitrary number of times. Here's what I came up with:
library(tidyverse)
df <- data.frame(letters = letters[1:4])
df
> df
letters
1 a
2 b
3 c
4 d
Let's say I want 2 A's, 3 B's, 2 C's and 4 D's:
df %>%
  mutate(count = c(2, 3, 2, 4)) %>%
  group_by(letters) %>%
  expand(count = seq_len(count))
# A tibble: 11 x 2
# Groups: letters [4]
letters count
<fctr> <int>
1 a 1
2 a 2
3 b 1
4 b 2
5 b 3
6 c 1
7 c 2
8 d 1
9 d 2
10 d 3
11 d 4
If you don't want to keep the count column:
df %>%
  mutate(count = c(2, 3, 2, 4)) %>%
  group_by(letters) %>%
  expand(count = seq_len(count)) %>%
  select(letters)
# A tibble: 11 x 1
# Groups: letters [4]
letters
<fctr>
1 a
2 a
3 b
4 b
5 b
6 c
7 c
8 d
9 d
10 d
11 d
If you want the count to reflect the number of times each letter is repeated:
df %>%
  mutate(count = c(2, 3, 2, 4)) %>%
  group_by(letters) %>%
  expand(count = seq_len(count)) %>%
  mutate(count = max(count))
# A tibble: 11 x 2
# Groups: letters [4]
letters count
<fctr> <dbl>
1 a 2
2 a 2
3 b 3
4 b 3
5 b 3
6 c 2
7 c 2
8 d 4
9 d 4
10 d 4
11 d 4
This is rife with peril if the data.frame has other columns (there, I said it!), but the do block will allow you to generate a derived data.frame within a dplyr pipe (though, ceci n'est pas un pipe):
library(dplyr)
df <- data.frame(column = letters[1:4], stringsAsFactors = FALSE)
df %>%
do( data.frame(column = rep(.$column, each = 4), stringsAsFactors = FALSE) )
# column
# 1 a
# 2 a
# 3 a
# 4 a
# 5 b
# 6 b
# 7 b
# 8 b
# 9 c
# 10 c
# 11 c
# 12 c
# 13 d
# 14 d
# 15 d
# 16 d
As @Frank suggested, a much better alternative could be:
df %>% slice(rep(1:n(), each = 4))
I did a quick benchmark showing that uncount() is a lot faster than expand():
# for the pipe
library(magrittr)
# create some test data
df_test <-
  tibble::tibble(
    letter = letters,
    row_count = sample(1:10, size = 26, replace = TRUE)
  )
# benchmark
bench <- microbenchmark::microbenchmark(
  expand = df_test %>%
    dplyr::group_by(letter) %>%
    tidyr::expand(row_count = seq_len(row_count)),
  uncount = df_test %>%
    tidyr::uncount(row_count)
)
# plot the benchmark
ggplot2::autoplot(bench)
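If you'd rather see the numbers than a plot (my addition), the benchmark object has print and summary methods:
bench            # prints the timing table
summary(bench)   # same information as a data frame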
There are many answers for how to split a dataframe, for example How to split a data frame?
However, I'd like to split a dataframe so that the smaller dataframes contain the last row of the previous dataframe and the first row of the following dataframe.
Here's an example
n <- 1:9
group <- rep(c("a","b","c"), each = 3)
df <- data.frame(n = n, group)
df
n group
1 1 a
2 2 a
3 3 a
4 4 b
5 5 b
6 6 b
7 7 c
8 8 c
9 9 c
I'd like the output to look like:
d1 <- data.frame(n = 1:4, group = c(rep("a",3),"b"))
d2 <- data.frame(n = 3:7, group = c("a",rep("b",3),"c"))
d3 <- data.frame(n = 6:9, group = c("b",rep("c",3)))
d <- list(d1, d2, d3)
d
[[1]]
n group
1 1 a
2 2 a
3 3 a
4 4 b
[[2]]
n group
1 3 a
2 4 b
3 5 b
4 6 b
5 7 c
[[3]]
n group
1 6 b
2 7 c
3 8 c
4 9 c
What is an efficient way to accomplish this task?
Suppose DF is the original data.frame, the one with columns n and group, and let n be the number of rows in DF. Now define a function extract which, given a sequence of indexes ix, enlarges it to include the index just before the first and just after the last, and then returns those rows of DF. With extract defined, split the vector 1, ..., n by group and apply extract to each component of the split.
n <- nrow(DF)
extract <- function(ix) DF[seq(max(1, min(ix) - 1), min(n, max(ix) + 1)), ]
lapply(split(seq_len(n), DF$group), extract)
$a
n group
1 1 a
2 2 a
3 3 a
4 4 b
$b
n group
3 3 a
4 4 b
5 5 b
6 6 b
7 7 c
$c
n group
6 6 b
7 7 c
8 8 c
9 9 c
Or why not try good ol' by, which "[a]ppl[ies] a Function to a Data Frame Split by Factors [INDICES]":
by(data = df, INDICES = df$group, function(x) {
  id <- c(min(x$n) - 1, x$n, max(x$n) + 1)
  na.omit(df[id, ])
})
# df$group: a
# n group
# 1 1 a
# 2 2 a
# 3 3 a
# 4 4 b
# --------------------------------------------------------------------------------
# df$group: b
# n group
# 3 3 a
# 4 4 b
# 5 5 b
# 6 6 b
# 7 7 c
# --------------------------------------------------------------------------------
# df$group: c
# n group
# 6 6 b
# 7 7 c
# 8 8 c
# 9 9 c
Although the print method of by creates a 'fancy' output, the (default) result is a list, with elements named by the levels of the grouping variable (just try str and names on the resulting object).
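For instance (my illustration), assigning the result makes the list structure visible:
res <- by(data = df, INDICES = df$group, function(x) {
  id <- c(min(x$n) - 1, x$n, max(x$n) + 1)
  na.omit(df[id, ])
})
names(res)               # "a" "b" "c"
str(res, max.level = 1)  # one data frame per group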
I was going to comment under @cdetermans' answer, but it's too late now.
You can generalize his approach using data.table::shift (or dplyr::lag) to find the group start indices, and then run a simple lapply on the ranges, something like:
library(data.table) # v1.9.6+
indx <- setDT(df)[, which(group != shift(group, fill = group[1L]))]
lapply(Map(`:`, c(1L, indx - 1L), c(indx, nrow(df))), function(x) df[x,])
# [[1]]
# n group
# 1: 1 a
# 2: 2 a
# 3: 3 a
# 4: 4 b
#
# [[2]]
# n group
# 1: 3 a
# 2: 4 b
# 3: 5 b
# 4: 6 b
# 5: 7 c
#
# [[3]]
# n group
# 1: 6 b
# 2: 7 c
# 3: 8 c
# 4: 9 c
Could be done with data.frame as well, but is there ever a reason not to use data.table? This approach also has the option to be executed in parallel; see the sketch after the output.
library(data.table)
n <- 1:9
group <- rep(c("a","b","c"), each = 3)
df <- data.table(n = n, group)
df[, group := factor(group)]
df[, `:=` (group_i = seq_len(.N), group_N = .N), by = "group"]
library(doParallel)
groups <- unique(df$group)
foreach(i = seq_along(groups)) %do% {
  df[group == groups[i] |
       (as.integer(group) == i + 1 & group_i == 1) |
       (as.integer(group) == i - 1 & group_i == group_N),
     c("n", "group"), with = FALSE]
}
[[1]]
n group
1: 1 a
2: 2 a
3: 3 a
4: 4 b
[[2]]
n group
1: 3 a
2: 4 b
3: 5 b
4: 6 b
5: 7 c
[[3]]
n group
1: 6 b
2: 7 c
3: 8 c
4: 9 c
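To actually run it in parallel (a sketch on my part; the worker count is arbitrary), register a backend and switch %do% to %dopar%:
library(doParallel)
cl <- makeCluster(2)   # assumption: 2 workers
registerDoParallel(cl)
res <- foreach(i = seq_along(groups), .packages = "data.table") %dopar% {
  df[group == groups[i] |
       (as.integer(group) == i + 1 & group_i == 1) |
       (as.integer(group) == i - 1 & group_i == group_N),
     c("n", "group"), with = FALSE]
}
stopCluster(cl)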
Here is another dplyr way:
library(dplyr)
data =
  tibble(n = n, group) %>%
  group_by(group)

firsts =
  data %>%
  slice(1) %>%
  ungroup %>%
  mutate(new_group = lag(group)) %>%
  slice(-1)

lasts =
  data %>%
  slice(n()) %>%
  ungroup %>%
  mutate(new_group = lead(group)) %>%
  slice(-n())

bind_rows(firsts, data, lasts) %>%
  mutate(final_group = ifelse(is.na(new_group), group, new_group)) %>%
  arrange(final_group, n) %>%
  group_by(final_group)
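The pipeline above ends with one grouped tibble; to get an actual list of data frames like the other answers (my addition; group_split() needs dplyr 0.8+), append a split step:
bind_rows(firsts, data, lasts) %>%
  mutate(final_group = ifelse(is.na(new_group), group, new_group)) %>%
  arrange(final_group, n) %>%
  group_by(final_group) %>%
  group_split(.keep = FALSE)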