Combine groups within dataframe if they share at least one common item - r

In my dataframe, I have a 'Groups' column and a 'Person' column. I want to join groups together if they share at least one common person. Consider the following example data:
Group Person
1 David
1 Sarah
1 John
2 Brian
2 Andrew
3 David
3 Charlie
4 Clare
4 Greg
5 Greg
5 Clare
5 Alan
In this example, Group 1 and Group 3 share a common person - David. The people in Group 2 do not overlap with the people in any other group. Group 4 and Group 5 share two common people Clare and Greg.
My desired output would be as follows:
Group Person
1 David
1 Sarah
1 John
1 Charlie
2 Brian
2 Andrew
3 Clare
3 Greg
3 Alan
Reproducible data:
structure(list(Group = c(1, 1, 1, 2, 2, 3, 3, 4, 5, 5), Person = c("David",
"Sarah", "John", "Brian", "Andrew", "David", "Charlie", "Clare",
"Greg", "Clare")), class = c("spec_tbl_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -10L), spec = structure(list(
cols = list(Group = structure(list(), class = c("collector_double",
"collector")), Person = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))

Using igraph cluster membership:
library(igraph)
# Build a graph in which every row of df1 is an edge Group -> Person, so two
# groups that share a person end up connected through that person's vertex.
g <- graph_from_data_frame(df1)
# Label every vertex (groups and persons alike) with the id of the connected
# component it belongs to. components() is the current name of this function;
# clusters() is its deprecated alias in recent igraph releases.
x <- components(g)$membership
x
# 1 2 3 4 5 David Sarah John Brian Andrew Charlie Clare Greg
# 1 2 1 3 3 1 1 1 2 2 1 3 3
# assign membership back to dataframe
df1$membership <- x[ df1$Person ]
df1
# Group Person membership
# 1 1 David 1
# 2 1 Sarah 1
# 3 1 John 1
# 4 2 Brian 2
# 5 2 Andrew 2
# 6 3 David 1
# 7 3 Charlie 1
# 8 4 Clare 3
# 9 5 Greg 3
# 10 5 Clare 3
We can use unique to avoid duplicated rows, and sort:
unique(df1[order(df1$membership), -1 ])
# Person membership
# 1 David 1
# 2 Sarah 1
# 3 John 1
# 7 Charlie 1
# 4 Brian 2
# 5 Andrew 2
# 8 Clare 3
# 9 Greg 3

This first groups by Person and stores each person's list of Groups in grouped_groups.
Then it groups by Group and creates a new character variable, new_grouping, which is the union of the grouped_groups lists within each Group. All of this uses tidyverse verbs.
# Step 1: per Person, store the vector of every Group that person appears in.
# Step 2: per Group, take the union of those vectors over all of its members
#         and collapse it into a single label such as "1-3".
# Note: only direct overlaps are merged, so a chain (1 shares with 3, 3 shares
# with 6) gives groups 1, 3 and 6 different labels.
DF %>%
  group_by(Person) %>%
  mutate(grouped_groups = list(Group)) %>%
  group_by(Group) %>%
  mutate(
    # Reduce over the list column created above; pasting the sorted vector
    # directly (not wrapped in list()) yields "1-3" rather than "c(1, 3)".
    new_grouping = paste(sort(reduce(grouped_groups, union)), collapse = "-")
  ) %>%
  ungroup()

Related

Find rows that are identical in one column but not another

There should be a fairly simple solution to this but it's giving me trouble. I have a DF similar to this:
> df <- data.frame(name = c("george", "george", "george", "sara", "sara", "sam", "bill", "bill"),
id_num = c(1, 1, 2, 3, 3, 4, 5, 5))
> df
name id_num
1 george 1
2 george 1
3 george 2
4 sara 3
5 sara 3
6 sam 4
7 bill 5
8 bill 5
I'm looking for a way to find rows where the name and ID numbers are inconsistent in a very large dataset. I.e., George should always be "1" but in row three there is a mistake and he has also been assigned ID number "2".
I think the easiest way will be to use dplyr::count twice, hence for your example:
# The first count() tallies rows per (name, id_num) pair; the second count()
# then reports how many distinct id_num values each name has.
# The id column of df is called id_num, so that is the name to count by.
df %>%
  count(name, id_num) %>%
  count(name)
The first count will give:
name id_num n
george 1 2
george 2 1
sara 3 2
sam 4 1
bill 5 2
Then the second count will give:
name n
george 2
sara 1
sam 1
bill 1
Of course, you could add filter(n > 1) to the end of your pipe, too, or arrange(desc(n))
# Same double count as above (by the real column name id_num), then list the
# names that carry more than one id, most ids first.
df %>%
  count(name, id_num) %>%
  count(name) %>%
  arrange(desc(n)) %>%
  filter(n > 1)
Use tapply() to count the distinct IDs per name, then keep only the names with more than one.
# Number of distinct id_num values recorded for each name.
res <- tapply(df$id_num, df$name, function(ids) length(unique(ids)))
# Keep only the names that map to more than one id.
res[res > 1]
# george
# 2
You probably want to correct this. A safe way is to rebuild the numeric ID's using as.factor(),
df$id_new <- as.integer(as.factor(df$name))
df
# name id_num id_new
# 1 george 1 2
# 2 george 1 2
# 3 george 2 2
# 4 sara 3 4
# 5 sara 3 4
# 6 sam 4 3
# 7 bill 5 1
# 8 bill 5 1
where numbers are assigned according to the names in alphabetical order, or factor(), reading in the levels in order of appearance.
df$id_new2 <- as.integer(factor(df$name, levels=unique(df$name)))
df
# name id_num id_new id_new2
# 1 george 1 2 1
# 2 george 1 2 1
# 3 george 2 2 1
# 4 sara 3 4 2
# 5 sara 3 4 2
# 6 sam 4 3 3
# 7 bill 5 1 4
# 8 bill 5 1 4
Note: R >= 4.1 used.
Data:
df <- structure(list(name = c("george", "george", "george", "sara",
"sara", "sam", "bill", "bill"), id_num = c(1, 1, 2, 3, 3, 4,
5, 5)), class = "data.frame", row.names = c(NA, -8L))

How to sort the values of each obs of a data.frame? [duplicate]

This question already has answers here:
Row wise Sorting in R
(2 answers)
Closed 3 years ago.
I have this data.set
people <- c("Arthur", "Jean", "Paul", "Fred", "Gary")
question1 <- c(1, 3, 2, 2, 5)
question2 <- c(1, 0, 1, 0, 3)
question3<- c(1, 0, 2, 2, 4)
question4 <- c(1, 5, 2, 1, 5)
test <- data.frame(people, question1, question2, question3, question4)
test
Here is my output :
people question1 question2 question3 question4
1 Arthur 1 1 1 1
2 Jean 3 0 0 5
3 Paul 2 1 2 2
4 Fred 2 0 2 1
5 Gary 5 3 4 5
I want to order each person's results like this (values sorted in descending order from the left column to the right) in a new data.frame. The names of the new columns can be letters or anything else.
people A B C D
1 Arthur 1 1 1 1
2 Jean 5 3 0 0
3 Paul 2 2 2 1
4 Fred 2 2 1 0
5 Gary 5 5 4 3
With base R, apply the function sort to the rows in question, but be careful: apply returns the transpose:
# Sort the answer columns of every row from largest to smallest. apply()
# hands back one column per input row, hence the t() to restore orientation.
test[-1] <- t(
  apply(test[-1], 1, function(answers) sort(answers, decreasing = TRUE))
)
test
# people question1 question2 question3 question4
#1 Arthur 1 1 1 1
#2 Jean 5 3 0 0
#3 Paul 2 2 2 1
#4 Fred 2 2 1 0
#5 Gary 5 5 4 3
Solution using tidyverse (i.e. dplyr and tidyr):
library(tidyverse)
# Reshape to long form, sort each person's values in descending order, label
# the sorted positions a, b, c, ... and reshape back to wide form.
test %>%
  pivot_longer(cols = -people, names_to = "variable", values_to = "values") %>%
  arrange(people, -values) %>%
  select(people, values) %>%
  # Name the positions within each person instead of repeating a hard-coded
  # letters[1:4] block: this works for any number of question columns and
  # does not reach back to the global `test`.
  group_by(people) %>%
  mutate(new_names = letters[seq_len(n())]) %>%
  ungroup() %>%
  pivot_wider(names_from = new_names, values_from = values)
This returns:
# A tibble: 5 x 5
people a b c d
<fct> <dbl> <dbl> <dbl> <dbl>
1 Arthur 1 1 1 1
2 Fred 2 2 1 0
3 Gary 5 5 4 3
4 Jean 5 3 0 0
5 Paul 2 2 2 1
Explanation:
bring data into 'long' form so we can order it on the values of all the 'question' variables.
order (arrange) on people and -values (see above)
remove the unused column variable
create a new column to hold the new names, name them a-d, for each value of people
bring the data into 'wide' form, creating new columns from the new names
One dplyr and tidyr option could be:
# Long form -> sort each person's values in descending order -> relabel the
# sorted positions A, B, C, ... -> back to wide form.
test %>%
  pivot_longer(-people) %>%
  group_by(people) %>%
  arrange(desc(value), .by_group = TRUE) %>%
  # seq_len(n()) is the safe form of 1:n(); it overwrites the original
  # question names with the position labels.
  mutate(name = LETTERS[seq_len(n())]) %>%
  # Drop the grouping so the result is a plain tibble.
  ungroup() %>%
  pivot_wider(names_from = "name", values_from = "value")
people A B C D
<fct> <dbl> <dbl> <dbl> <dbl>
1 Arthur 1 1 1 1
2 Fred 2 2 1 0
3 Gary 5 5 4 3
4 Jean 5 3 0 0
5 Paul 2 2 2 1

Summarizing dataframe based on multiple columns

I'm having some trouble figuring this one out. Say, I have a table like this:
Name Activity Day
1 John cycle 1
2 John work 1
3 Tina work 1
4 Monika work 1
5 Tina swim 1
6 Tina jogging 2
7 John work 2
8 Tina work 2
I want to summarize it in a way that the activity of each individual is grouped according to the day.
It should look like this:
Name Activity Day
1 John cycle;work 1
2 Tina work;swim 1
3 Monika work 1
4 Tina jogging;work 2
5 John work 2
I am thinking that dplyr package would be the answer here, but I don't know how to do it. Any help?
Thanks!
try:
library(dplyr)
dat <- tribble(~"Name", ~"Activity", ~"Day",
"John", "cycle", 1,
"John", "work" , 1,
"Tina", "work", 1,
"Monika", "work", 1,
"Tina", "swim", 1,
"Tina", "jogging", 2,
"John", "work", 2,
"Tina", "work", 2)
dat %>%
group_by(Name, Day) %>%
summarise(activity = paste(Activity, collapse = "; "))
# A tibble: 5 x 3
# Groups: Name [3]
Name Day activity
<chr> <dbl> <chr>
1 John 1 cycle; work
2 John 2 work
3 Monika 1 work
4 Tina 1 work; swim
5 Tina 2 jogging; work
An option with data.table
library(data.table)
setDT(dat)[, .(Activity = toString(Activity)), .(Name, Day)]
You can use the aggregate function, for example:
# toString() collapses each (Name, Day) group's activities into a single
# comma-separated string, so x is a plain character column rather than a
# list of character vectors (which is what as.character would leave behind).
aggregate(dat$Activity, list(dat$Name, dat$Day), toString)
Group.1 Group.2 x
1 John 1 cycle, work
2 Monika 1 work
3 Tina 1 work, swim
4 John 2 work
5 Tina 2 jogging, work

Expanding a list to include all possible pairwise combinations within a group

I am currently running a randomization where individuals of a given population are sampled and placed into groups of defined size. The result is a data frame seen below:
Ind Group
Sally 1
Bob 1
Sue 1
Joe 2
Jeff 2
Jess 2
Mary 2
Jim 3
James 3
Is there a function which will allow me to expand the data set to show every possible within group pairing? (Desired output below). The pairings do not need to be reciprocal.
Group Ind1 Ind2
1 Sally Bob
1 Sally Sue
1 Sue Bob
2 Joe Jeff
2 Joe Jess
2 Joe Mary
2 Jeff Jess
2 Jess Mary
2 Jeff Mary
3 Jim James
I feel like there must be a way to do this in dplyr, but for the life of me I can't seem to sort it out.
An alternative dplyr & tidyr approach: The pipeline is a little longer, but the wrangling feels more straightforward to me. Start with combining all records in each group together. Next, pool and alphabetize all the names together to be able to eliminate the reciprocal/duplicates. Then finally separate the results back apart again.
left_join(dt, dt, by = "Group") %>%
filter(Ind.x != Ind.y) %>%
rowwise %>%
mutate(name = toString(sort(c(Ind.x,Ind.y)))) %>%
select(Group, name) %>%
distinct %>%
separate(name, into = c("Ind1", "Ind2")) %>%
arrange(Group, Ind1, Ind2)
start off with a weak cross join of all records in each group
filter out the self joins
collect up all the names in each row, sort them, and set them down together in the name column.
now that the names are alphabetized, remove the alphabetized reciprocals
pull the data apart back into separate columns.
# A tibble: 10 x 3
Group Ind1 Ind2
* <int> <chr> <chr>
1 1 Bob Sally
2 1 Sally Sue
3 1 Bob Sue
4 2 Jeff Joe
5 2 Jess Joe
6 2 Joe Mary
7 2 Jeff Jess
8 2 Jeff Mary
9 2 Jess Mary
10 3 James Jim
Here is an option using data.table. Convert to data.table (setDT(dt)), Do a cross join (CJ) grouped by 'Group' and remove the duplicated elements
library(data.table)
setDT(dt)[, CJ(Ind1 = Ind, Ind2 = Ind, unique = TRUE)[Ind1 != Ind2],
Group][!duplicated(data.table(pmax(Ind1, Ind2), pmin(Ind1, Ind2)))]
# Group Ind1 Ind2
#1: 1 Bob Sally
#2: 1 Bob Sue
#3: 1 Sally Sue
#4: 2 Jeff Jess
#5: 2 Jeff Joe
#6: 2 Jeff Mary
#7: 2 Jess Joe
#8: 2 Jess Mary
#9: 2 Joe Mary
#10: 3 James Jim
Or using combn by 'Group'
setDT(dt)[, {temp <- combn(Ind, 2); .(Ind1 = temp[1,], Ind2 = temp[2,])}, Group]
A solution using dplyr. We can use group_by and do to apply the combn function to each group and combine the results to form a data frame.
library(dplyr)
dt2 <- dt %>%
group_by(Group) %>%
do(as_data_frame(t(combn(.$Ind, m = 2)))) %>%
ungroup() %>%
setNames(sub("V", "Ind", colnames(.)))
dt2
# # A tibble: 10 x 3
# Group Ind1 Ind2
# <int> <chr> <chr>
# 1 1 Sally Bob
# 2 1 Sally Sue
# 3 1 Bob Sue
# 4 2 Joe Jeff
# 5 2 Joe Jess
# 6 2 Joe Mary
# 7 2 Jeff Jess
# 8 2 Jeff Mary
# 9 2 Jess Mary
# 10 3 Jim James
DATA
dt <- read.table(text = "Ind Group
Sally 1
Bob 1
Sue 1
Joe 2
Jeff 2
Jess 2
Mary 2
Jim 3
James 3",
header = TRUE, stringsAsFactors = FALSE)

Create a data frame from list values

I have the following list:
peter <- data.frame(year = 1:5, a = rnorm(5), b = rnorm(5))
john <- data.frame(year = 1:5, a = rnorm(5), b = rnorm(5))
myList <- list(peter, john)
names(myList) <- c("peter", "john")
myList
$peter
year a b
1 1.01464245 0.2490931
2 1.38054309 0.8396630
3 -0.84094830 0.2410526
4 -0.05567379 0.6369121
5 -0.66412862 1.5739672
$john
year a b
1 0.3060996 -0.4256702
2 0.7167710 -0.6828029
3 -0.6896138 0.6577422
4 -1.7647412 -0.5651756
5 0.3065734 -0.4860141
How can I transform myList into the following data frame:
year student a b
1 1 peter 1.01464245 0.2490931
2 2 peter 1.38054309 0.8396630
3 3 peter -0.84094830 0.2410526
4 4 peter -0.05567379 0.6369121
5 5 peter -0.66412862 1.5739672
6 1 john 0.30609964 -0.4256702
7 2 john 0.71677097 -0.6828029
8 3 john -0.68961377 0.6577422
9 4 john -1.76474117 -0.5651756
10 5 john 0.30657340 -0.4860141
Thank you very much.
library(plyr)
dat <- ldply(myList)
colnames(dat) <- c("student", "year", "a", "b")
print(dat)
## student year a b
## 1 peter 1 0.03716519 0.8465317
## 2 peter 2 -1.15449127 1.5461944
## 3 peter 3 0.15933780 0.7468312
## 4 peter 4 0.91745104 0.1113958
## 5 peter 5 -0.22924789 -0.5344617
## 6 john 1 0.40790134 0.5886599
## 7 john 2 -0.88635369 -0.3596063
## 8 john 3 -1.16444277 1.1080161
## 9 john 4 -0.19082412 0.1675609
## 10 john 5 1.19066829 -0.8855810
Another alternative (very similar to Ben's)
# Stack the per-student data frames; the row names of the result carry the
# list names followed by ".<row number>" (e.g. "peter.1").
df <- do.call(rbind, myList)
# Recover the student name by stripping that trailing suffix. The pattern is
# anchored at the end and uses "+" so that multi-digit row numbers such as
# "peter.10" are removed completely instead of leaving "peter0".
df <- transform(df, student = sub("\\.[0-9]+$", "", rownames(df)))
df <- df[, c("year", "student", "a", "b")]
rownames(df) <- NULL
df
year student a b
1 1 peter -0.71040656 -0.04502772
2 2 peter 0.25688371 -0.78490447
3 3 peter -0.24669188 -1.66794194
4 4 peter -0.34754260 -0.38022652
5 5 peter -0.95161857 0.91899661
6 1 john -0.57534696 0.30115336
7 2 john 0.60796432 0.10567619
8 3 john -1.61788271 -0.64070601
9 4 john -0.05556197 -0.84970435
10 5 john 0.51940720 -1.02412879
Similar to Ben's, only a bit different.
dd <- do.call(rbind, myList)
cbind(dd[1], student = sub("[.].*", "", rownames(dd)), dd[2:3], row.names = NULL)
# year student a b
# 1 1 peter -1.66983899 0.3683629
# 2 2 peter 0.25391016 -0.4999335
# 3 3 peter -0.19102468 -0.9344484
# 4 4 peter 1.72821089 -2.6148841
# 5 5 peter 0.30320439 -0.2602509
# 6 1 john -0.02447092 -0.2396401
# 7 2 john -1.57022813 1.1159078
# 8 3 john 2.82545689 0.6818537
# 9 4 john -0.11273218 -1.8000738
# 10 5 john -1.39706920 0.1647720
Update Sept 16, 2015 An improvement on my previous answer:
# Insert a `student` column, filled with `y`, right after the first column
# of the data frame `x`; the remaining columns keep their order.
f <- function(x, y) {
  leading <- x[1]
  trailing <- x[-1]
  cbind(leading, student = y, trailing)
}
do.call(rbind, Map(f, myList, names(myList), USE.NAMES = FALSE))
A bit unwieldy, but:
# Two example students, each with five yearly observations of `a` and `b`.
peter <- data.frame(year = seq_len(5), a = rnorm(5), b = rnorm(5))
john <- data.frame(year = seq_len(5), a = rnorm(5), b = rnorm(5))
myList <- list(peter = peter, john = john)
# Prefix every data frame with its student's name, then stack the pieces.
do.call(
  rbind,
  Map(function(student, d) data.frame(student, d), names(myList), myList)
)
define a function that creates a data frame from the student name and the student information;
pass it to mapply() (with SIMPLIFY=FALSE) to get a list of augmented data frames;
do.call(rbind,...) to combine the pieces.
Alternatively:
# Stack all per-student data frames into one.
info <- do.call(rbind, myList)
# Repeat each student's name once per row of that student's data frame.
# vapply() guarantees an integer count per element, unlike sapply(), whose
# return type depends on the input.
student <- rep(names(myList), vapply(myList, nrow, integer(1)))
data.frame(student, info)

Resources