get consensus of multiple partitioning methods in R - r

My data:
data=cbind(c(1,1,2,1,1,3),c(1,1,2,1,1,1),c(2,2,1,2,1,2))
colnames(data)=paste("item",1:3)
rownames(data)=paste("method",1:6)
I want as an output that according to majority vote, there are two communities (with their elements). Something like: group1={item1, item2}, group2={item3}.

You can try this, base R:
res=apply(data,2,function(u) as.numeric(names(sort(table(u), decreasing=T))[1]))
setNames(lapply(unique(res), function(u) names(res)[res==u]), unique(res))
#$`1`
#[1] "item 1" "item 2"
#$`2`
#[1] "item 3"

This function is passed a matrix where each column is an item and each row is a membership vector corresponding to a partition of the items according to a clustering method. The elements (numbers) composing each row have no meaning other than indicating membership and are recycled from row to row. The function returns the majority vote partition. When no consensus exists for an item, the partition given by the first row wins. This allows ordering of the partitions by decreasing values of modularity, for instance.
consensus.final <-
function(data){
output=list()
for (i in 1:nrow(data)){
row=as.numeric(data[i,])
output.inner=list()
for (j in 1:length(row)){
group=character()
group=c(group,colnames(data)[which(row==row[j])])
output.inner[[j]]=group
}
output.inner=unique(output.inner)
output[[i]]=output.inner
}
# gives the mode of the vector representing the number of groups found by each method
consensus.n.comm=as.numeric(names(sort(table(unlist(lapply(output,length))),decreasing=TRUE))[1])
# removes the elements of the list that do not correspond to this consensus solution
output=output[lapply(output,length)==consensus.n.comm]
# 1) find intersection
# 2) use majority vote for elements of each vector that are not part of the intersection
group=list()
for (i in 1:consensus.n.comm){
list.intersection=list()
for (p in 1:length(output)){
list.intersection[[p]]=unlist(output[[p]][i])
}
# candidate group i
intersection=Reduce(intersect,list.intersection)
group[[i]]=intersection
# we need to reinforce that group
for (p in 1:length(list.intersection)){
vector=setdiff(list.intersection[[p]],intersection)
if (length(vector)>0){
for (j in 1:length(vector)){
counter=vector(length=length(list.intersection))
for (k in 1:length(list.intersection)){
counter[k]=vector[j]%in%list.intersection[[k]]
}
if(length(which(counter==TRUE))>=ceiling((length(counter)/2)+0.001)){
group[[i]]=c(group[[i]],vector[j])
}
}
}
}
}
group=lapply(group,unique)
# variables for which consensus has not been reached
unclassified=setdiff(colnames(data),unlist(group))
if (length(unclassified)>0){
for (pp in 1:length(unclassified)){
temp=matrix(nrow=length(output),ncol=consensus.n.comm)
for (i in 1:nrow(temp)){
for (j in 1:ncol(temp)){
temp[i,j]=unclassified[pp]%in%unlist(output[[i]][j])
}
}
# use the partition of the first method when no majority exists (this allows ordering of partitions by decreasing modularity values for instance)
index.best=which(temp[1,]==TRUE)
group[[index.best]]=c(group[[index.best]],unclassified[pp])
}
}
output=list(group=group,unclassified=unclassified)
}
Some examples:
data=cbind(c(1,1,2,1,1,3),c(1,1,2,1,1,1),c(2,2,1,2,1,2))
colnames(data)=paste("item",1:3)
rownames(data)=paste("method",1:6)
data
consensus.final(data)$group
[[1]]
[1] "item 1" "item 2"
[[2]]
[1] "item 3"
data=cbind(c(1,1,1,1,1,3),c(1,1,1,1,1,1),c(1,1,1,2,1,2))
colnames(data)=paste("item",1:3)
rownames(data)=paste("method",1:6)
data
consensus.final(data)$group
[[1]]
[1] "item 1" "item 2" "item 3"
data=cbind(c(1,3,2,1),c(2,2,3,3),c(3,1,1,2))
colnames(data)=paste("item",1:3)
rownames(data)=paste("method",1:4)
data
consensus.final(data)$group
[[1]]
[1] "item 1"
[[2]]
[1] "item 2"
[[3]]
[1] "item 3"

Related

Sequence of numbers by hyphen without hyphenating single occurrences

I want to generate readable number sequences (e.g. 1, 2, 3, 4 = 1-4), but for a set of data where each number in the sequence must have four digits (e.g. 99 = 0099 or 1 = 0001 or 1022 = 1022) AND where there are different letters in front of each number.
I was looking at the answer to this question, which managed to do almost exactly as I want with two caveats:
If there is a stand-alone number that does not appear in a sequence, it will appear twice with a hyphen in between
If there are several stand-alone numbers that do no appear in a sequence, they won't be included in the result
### Create Data Set ====
## Create the data for different tags. I'm only using two unique levels here, but in my dataset I've got
## 400+ unique levels.
FM <- paste0('FM', c('0001', '0016', '0017', '0018', '0019', '0021', '0024', '0026', '0028'))
SC <- paste0('SC', c('0002', '0003', '0004', '0010', '0012', '0014', '0033', '0036', '0039'))
## Combine data
my.seq1 <- c(FM, SC)
## Sort data by number in sequence
my.seq1 <- my.seq1[order(substr(my.seq1, 3, 7))]
### Attempt Number Sequencing ====
## Get the letters
sp.tags <- substr(my.seq1, 1, 2)
## Get the readable number sequence
lapply(split(my.seq1, sp.tags), ## Split data by the tag ID
function(x){
## Get the run lengths as per [previous answer][1]
rl <- rle(c(1, pmin(diff(as.numeric(substr(x, 3, 7))), 2)))
## Generate number sequence by separator as per [previous answer][1]
seq2 <- paste0(x[c(1, cumsum(rl$lengths))], c("-", ",")[rl$values], collapse="")
return(substr(seq2, 1, nchar(seq2)-1))
})
## Combine lists and sort elements
my.seq2 <- unlist(strsplit(do.call(c, my.seq2), ","))
my.seq2 <- my.seq2[order(substr(my.seq2, 3, 7))]
names(my.seq2) <- NULL
my.seq2
[1] "FM0001-FM0001" "SC0002-SC0004" "FM0016-FM0019" "FM0028" "SC0039"
my.seq1
[1] "FM0001" "SC0002" "SC0003" "SC0004" "SC0010" "SC0012" "SC0014" "FM0016" "FM0017" "FM0018" "FM0019" "FM0021"
[13] "FM0024" "FM0026" "FM0028" "SC0033" "SC0036" "SC0039"
The major problems with this are:
Some values are completely missing from the data set (e.g. FM0021, FM0024, FM0026)
The first number in the sequence (FM0001) appears with a hyphen in between
I feel like I'm getting warmer by using A5C1D2H2I1M1N2O1R2T1's answer to utilize seqToHumanReadable because it's quite elegant AND solves both problems. Two more problems are that I'm not able to tag the ID before each number and can't force the number of digits to four (e.g. 0004 becomes 4).
library(R.utils)
lapply(split(my.seq1, sp.tags), function(x){
return(unlist(strsplit(seqToHumanReadable(substr(x, 3, 7)), ',')))
})
$FM
[1] "1" " 16-19" " 21" " 24" " 26" " 28"
$SC
[1] "2-4" " 10" " 12" " 14" " 33" " 36" " 39"
Ideally the result would be:
"FM0001, SC002-SC004, SC0012, SC0014, FM0017-FM0019, FM0021, FM0024, FM0026, FM0028, SC0033, SC0036, SC0039"
Any ideas? It's one of those things that's really simple to do by hand but would take blinking ages, and you'd think a function would exist for it but I haven't found it yet or it doesn't exist :(
This should do?
# get the prefix/tag and number
tag <- gsub("(^[A-z]+)(.+)", "\\1", my.seq1)
num <- gsub("([A-z]+)(\\d+$)", "\\2", my.seq1)
# get a sequence id
n <- length(tag)
do_match <- c(FALSE, diff(as.numeric(num)) == 1 & tag[-1] == tag[-n])
seq_id <- cumsum(!do_match) # a sequence id
# tapply to combine the result
res <- setNames(tapply(my.seq1, seq_id, function(x)
if(length(x) < 2)
return(x)
else
paste(x[1], x[length(x)], sep = "-")), NULL)
# show the result
res
#R> [1] "FM0001" "SC0002-SC0004" "SC0010" "SC0012" "SC0014" "FM0016-FM0019" "FM0021"
#R> [8] "FM0024" "FM0026" "FM0028" "SC0033" "SC0036" "SC0039"
# compare with
my.seq1
#R> [1] "FM0001" "SC0002" "SC0003" "SC0004" "SC0010" "SC0012" "SC0014" "FM0016" "FM0017" "FM0018" "FM0019" "FM0021" "FM0024"
#R> [14] "FM0026" "FM0028" "SC0033" "SC0036" "SC0039"
Data
FM <- paste0('FM', c('0001', '0016', '0017', '0018', '0019', '0021', '0024', '0026', '0028'))
SC <- paste0('SC', c('0002', '0003', '0004', '0010', '0012', '0014', '0033', '0036', '0039'))
my.seq1 <- c(FM, SC)
my.seq1 <- my.seq1[order(substr(my.seq1, 3, 7))]

Partial Match String and full replacement over multiple vectors

Would like to efficiently replace all partial match strings over a single column by supplying a vector of strings which will be searched (and matched) and also be used as replacement. i.e. for each vector in df below, it will partially match for vectors in vec_string. Where matches is found, it will simply replace the entire string with vec_string. i.e. turning 'subscriber manager' to 'manager'. By supplying more vectors into vec_string, it will search through the whole df until all is complete.
I have started the function, but can't seem to finish it off by replacing the vectors in df with vec_string. Appreciate your help
df <- c(
'solicitor'
,'subscriber manager'
,'licensed conveyancer'
,'paralegal'
,'property assistant'
,'secretary'
,'conveyancing paralegal'
,'licensee'
,'conveyancer'
,'principal'
,'assistant'
,'senior conveyancer'
,'law clerk'
,'lawyer'
,'legal practice director'
,'legal secretary'
,'personal assistant'
,'legal assistant'
,'conveyancing clerk')
vec_string <- c('manager','law')
#function to search and replace
replace_func <-
function(vec,str_vec) {
repl_str <- list()
for(i in 1:length(str_vec)) {
repl_str[[i]] <- grep(str_vec[i],unique(tolower(vec)))
}
names(repl_str) <- vec_string
return(repl_str)
}
replace_func(df,vec_string)
$`manager`
[1] 2
$law
[1] 13 14
As you can see, the function returns a named list with elements to which the replacement will
This should do the trick
res = sapply(df,function(x){
match = which(sapply(vec_string,function(y) grepl(y,x)))
if (length(match)){x=vec_string[match[1]]}else{x}
})
res
[1] "solicitor" "manager" "licensed conveyancer"
[4] "paralegal" "property assistant" "secretary"
[7] "conveyancing paralegal" "licensee" "conveyancer"
[10] "principal" "assistant" "senior conveyancer"
[13] "law" "law" "legal practice director"
[16] "legal secretary" "personal assistant" "legal assistant"
[19] "conveyancing clerk"
We compare each part of df with each part of vec_string. If there is a match, the vec_string part is returned, else it is left as it is. Watch out as if there are more than 1 matches it will keep the first one.

Output of binaryRatingMatrix in r

I have created a matrix in Excel, of customers and items that have been purchased, by the customers. The column names are "Item 1", "Item 2" ... "Item n" and the row names are "Customer 1", "Customer 2", ... ,"Customer n".
The code is as follows:
library(recommenderlab)
setwd("C:\\Users\\amitai\\Desktop\\se")
USERBASE <- read.csv("USERBASE.csv")
USERBASE2 <- as(USERBASE,"binaryRatingMatrix")
rec <- Recommender(USERBASE2, method = "UBCF")
recommended.items.customer1 <- predict(rec, USERBASE2["customer 1",], n=5)
as(recommended.items.customer1, "list")
I expected to get a list of 5 items, which are most recommended to customer 1.
Instead, I got this output:
$`customer 1`
[1] "1"
After running the same code for customer 103, I got a similar output:
$`customer 103`
[1] "1"
My questions are as follows:
Is there a problem with the code I wrote?
Is there another action I should take after reading the CSV file?
How is the correct output supposed to look like?

how to sort out a nested list in R

The original data was a simple list named "data" like this
[1] "score: 10 / review 1 / ID 1
[2] "score: 9 / review 2 / ID 2
[3] "score: 8 / review 3 / ID 3
----
[30] "score: 7 / review 30 / ID&DATE: 30
In order to sort out scores reviews and ID&DATEs separately,
I first made it a matrix, and then split them by "/" using str_split "stringr"
so the whole process went like this.
a1 <- readLines("data.txt")
a2 <- t(a1) # Matrix
a3 <- t(a2) # reversing rows and columns
b1 <- str_split(a,"/")
here is the problem
b1 came out as a nested list like this.
[[1]]
[1] "score: 10"
[2] "review 1"
[3] "ID 1"
[[2]]
[1] "score: 9"
[2] "review 2"
[3] "ID 2"
[[3]]
[1] "score: 8"
[2] "review 3"
[3] "ID 3"
------
[[30]]
[1] "score: 7"
[2] "review 30"
[3] "ID 30"
I want to extract the values of [[1]][1], [[2]][1], [[3]][1], ... [[30]][1], [[n]][2], and [[n]][3] SEPARATELY, and make each one of them a dataframe.
Any clues?
The following would work for a particular type of nested list that looks like your data. Without a reproducible example, I don't know for sure:
# create nested list
temp <- list(a=c(list("score: 10"), "review 1", "ID 1"),
b=c("score: 9", "review 2", "ID 2"),
c=c("score: 8", "review 3","ID 3"))
# create data frame from this list
df <- data.frame(score=unlist(sapply(temp, function(i) i[1])),
review=unlist(sapply(temp, function(i) i[2])),
ID=unlist(sapply(temp, function(i) i[3])))
I use sapply to pull out elements from each list item. Then, unlist is applied to the output so that it becomes a vector. All of this out put is wrapped in a data.frame. Note that you can rearrange the output so that the variables are arranged differently.
An even cleaner method, mentioned by #parfait, uses do.call and rbind:
# construct data.frame, rbinding each list item
df <- data.frame(do.call(rbind, temp))
# add the desired names
names(df) <- c('score', 'review', 'ID')

fuse some information in a vector

Something maybe obvious but I can't seem to see it :
I have a vector like this :
vec<-c("i: 1","n: alpha","a: term1","a: term2", "i: 2","n: beta","a: term3","i: 3","n: gamma","a: term4","a: term5","a: term6")
and I need to get this :
out<-c("i: 1","n: alpha","a: term1;term2", "i: 2","n: beta","a: term3","i: 3","n: gamma","a: term4;term5;term6")
That is, for each unique i:, fuse the a: when there are more than one.
I tried with diff and rle but the resulted code (see below) is too long and I think I'm complicating uselessly the problem...
my code :
out<-vec
a<-which(grepl("^a: ",vec))
diffa<-diff(a)
diffa1<-which(diffa==1)
rle_a<-rle(diffa)$lengths[rle(diffa)$values==1]
indwh<-1
for(ind in 1:length(rle_a)){
allindwh<-indwh:(indwh+rle_a[ind]-1)
out[a[c(diffa1[allindwh],diffa1[allindwh[length(allindwh)]]+1)]]<-paste(out[a[diffa1[allindwh[1]]]],paste(gsub("a: ","",out[a[c(diffa1[allindwh[-1]],diffa1[allindwh[length(allindwh)]]+1)]]),collapse=";"),sep=";")
indwh<-indwh+rle_a[ind]
}
out<-unique(out)
So I get what I want but I would really appreciate any hint to simplify it.
Here's an easier approach with tapply:
# index of 'a's
idx <- grepl("^a", vec)
# find groups
grp <- c(0, cumsum(diff(idx) < 0))
# apply function to vector based on groups
unlist(tapply(vec, grp, FUN = function(x)
c(x[1:2], paste("a:", paste(sub("^a:\\s*", "", x[-(1:2)]), collapse = ";")))),
use.names = FALSE)
# [1] "i: 1" "n: alpha" "a: term1;term2"
# [4] "i: 2" "n: beta" "a: term3"
# [7] "i: 3" "n: gamma" "a: term4;term5;term6"

Resources