Generate matrix in RevoScaleR from a data frame - r

I have a data frame as below:
group sex age
A M 15
A F 17
A M 12
A F 2
A F 6
A M 3
A M 10
A M 18
B F 16
B M 6
B M 18
B M 15
B F 8
B F 17
B M 18
B M 16
B F 13
B F 5
B F 13
B F 4
B M 15
B M 8
B M 18
C F 7
C M 12
C M 3
C F 1
C F 9
C F 2
expected result for this data frame.
A B C
A 0 4 3
B 4 0 0
C 3 0 0
I would like to generate a matrix showing the similarity among "group" in input data, based on the "age". For example, if group A and group B have 2 similar ages, then the common element A and B will be 2.

One solution with outer:
library(magrittr)
func = Vectorize(function(u,v)
{
if(all(u==v)) return(0)
intersect(subset(df, group==u)$age, subset(df, group==v)$age) %>% unique %>% length
})
x = df$group %>% unique
m = outer(x, x, func)
row.names(m) = colnames(m) = x
#>m
# A B C
#A 0 4 3
#B 4 0 0
#C 3 0 0

We could merge the dataset ("df") to itself by "age" on a subset of dataset ("df[-2]", ie. the second column is removed), remove the rows that are the same for "group.x" and "group.y", and reshape the unique dataset ("df1") from "long" to "wide" using acast.
df1 <- subset(merge(df[-2], df[-2], by.x='age',
by.y='age'), group.x!=group.y)
library(reshape2)
acast(unique(df1), group.x~group.y, value.var='age')
# A B C
#A 0 4 3
#B 4 0 0
#C 3 0 0
Or use xtabs from base R
xtabs(~group.x+group.y, unique(df1))
# group.y
#group.x A B C
# A 0 4 3
# B 4 0 0
# C 3 0 0
Update
Regarding the new dataset/expected result, it is not clear which column should be included in the relationship with "re". Here, I used "pro_id" to get the expected result.
tbl <- crossprod(table(df[c(3,1)]))
diag(tbl) <- 0
tbl
# re
#re 144 205 209 222 235 250
# 144 0 1 2 0 0 0
# 205 1 0 1 0 0 0
# 209 2 1 0 0 0 0
# 222 0 0 0 0 0 1
# 235 0 0 0 0 0 0
# 250 0 0 0 1 0 0

Related

A Custom sort of the values within a dataframe in R

I am a newbie trying to learn R and I have a data frame like this:
a b c d
a 0 6 2 0
b 1 0 3 0
c 0 0 0 2
d 0 0 0 0
I want to sort a dataframe by two actions:
1. First, find the row which has the maximum TOTAL value and creating this
a b c d TOTAL
a 0 6 2 0 8
b 1 0 3 0 4
c 0 0 0 2 2
d 0 0 0 0 0
Second, select the row with the maximum value and recording the crossed
value in front of each character from max to min. So it results into a new dataframe like this:
'x'
a-b 6 #considering values for "a" where it meets "b"
a-c 2
b-c 3 #b has the second max TOTAL value
b-b 1
c-d 2 # finally, values in front of c
I'd appreciate your help on this one.
EDIT: adding source data at bottom
library(tidyr); library(dplyr)
df %>%
gather(col, val, -row) %>% # Pull into long form, with one row for each row-col
arrange(row, -val) %>% # Sort by row and descending value
filter(val != 0) %>% # Only keep non-zeros
unite("row", c("row", "col"))# combine row and col columns
row val
1 a_b 6
2 a_c 2
3 b_c 3
4 b_a 1
5 c_d 2
# Inputing data with "row" column
df <- read.table(
header = T,
stringsAsFactors = F,
text = "row a b c d
a 0 6 2 0
b 1 0 3 0
c 0 0 0 2
d 0 0 0 0 ")
Not completely certain, but is this what you want? You say you have a dataframe but it looks more like you have a matrix and it's not clear if you want to keep your first action or if that's just an intermediate step.
mat <- as.matrix(df)
df1 <- data.frame(addmargins(mat, 2))
df1
a b c d Sum
a 0 6 2 0 8
b 1 0 3 0 4
c 0 0 0 2 2
d 0 0 0 0 0
df2 <- as.data.frame(as.table(mat))
df2 <- df2[df2$Freq != 0,]
df2[with(df2, order(ave(Freq, Var1, FUN = sum), Freq, decreasing = TRUE)), ]
Var1 Var2 Freq
5 a b 6
9 a c 2
10 b c 3
2 b a 1
15 c d 2
Data:
df <- read.table(text="a b c d
0 6 2 0
1 0 3 0
0 0 0 2
0 0 0 0", header = TRUE, row.names = letters[1:4])
First question is just rowSums , for you second I am using melt , then order with groupby max and the value itself
s=setNames(reshape2::melt(as.matrix(df)), c('rows', 'vars', 'values'))
s=s[s$values!=0,]
s[order(-ave(s$values,s$rows,FUN=max),-s$values),]
rows vars values
5 a b 6
9 a c 2
10 b c 3
2 b a 1
15 c d 2

add column with total count of rows meeting a condition in dplyr

Trying to get totals by class and condition but not grouping data.
Reproducible example:
df <- data.frame("class" = c("a","b","c","d","b","b","b","b","c","c","a"),"increment" = c(0,0,0,0,0,0,32,12,0,0,0))
R> df
class increment
1 a 0
2 b 0
3 c 0
4 d 0
5 b 0
6 b 0
7 b 32
8 b 12
9 c 0
10 c 0
11 a 0
I want the total cases where increment is different from Zero but for every class.
Desired output:
R> df
class increment increment_count_per_class
1 a 0 0
2 b 0 2
3 c 0 0
4 d 0 0
5 b 0 2
6 b 0 2
7 b 32 2
8 b 12 2
9 c 0 0
10 c 0 0
11 a 0 0
My first approach is here below, but I know there must be a less convoluted way using dplyr:
df <- df %>% mutate(has.increment = ifelse(increment>0,1,0))
R> df
class increment has.increment
1 a 0 0
2 b 0 0
3 c 0 0
4 d 0 0
5 b 0 0
6 b 0 0
7 b 32 1
8 b 12 1
9 c 0 0
10 c 0 0
11 a 0 0
Get totals per class when increment exists
N <- df %>% group_by(class,has.increment) %>% tally() %>% filter(has.increment == 1)
R> N
# A tibble: 1 x 3
# Groups: class [1]
class has.increment n
<chr> <dbl> <int>
1 b 1 2
Then join:
merge(N,df, by = "class", all = TRUE)
R> merge(N,df, by = "class", all = TRUE)
class has.increment.x n increment has.increment.y
1 a NA NA 0 0
2 a NA NA 0 0
3 b 1 2 0 0
4 b 1 2 12 1
5 b 1 2 0 0
6 b 1 2 0 0
7 b 1 2 32 1
8 c NA NA 0 0
9 c NA NA 0 0
10 c NA NA 0 0
11 d NA NA 0 0
Try this:
df %>%
group_by(class) %>%
mutate(increment_count_per_class = sum(increment!=0))

How to replace certain values with their column name

I have a following table in R
df <- data.frame('a' = c(1,0,0,1,0),
'b' = c(1,0,0,1,0),
'c' = c(1,1,0,1,1))
df
a b c
1 1 1 1
2 0 0 1
3 0 0 0
4 1 1 1
4 0 0 1
What I want is to replace the row value with the column name whenever the row is equal to 1. The output would be this one:
a b c
1 a b c
2 0 0 c
3 0 0 0
4 a b c
4 0 0 c
How can I do this in R? Thanks.
I would use Map and replace:
df[] <- Map(function(n, x) replace(x, x == 1, n), names(df), df)
df
# a b c
# 1 a b c
# 2 0 0 c
# 3 0 0 0
# 4 a b c
# 5 0 0 c
We can use
df[] <- names(df)[(NA^!df) * col(df)]
df[is.na(df)] <- 0
df
# a b c
#1 a b c
#2 0 0 c
#3 0 0 0
#4 a b c
#4 0 0 c
You can try stack and unstack
a=stack(df)
a
values ind
1 1 a
2 0 a
3 0 a
4 1 a
5 0 a
6 1 b
7 0 b
8 0 b
9 1 b
10 0 b
11 1 c
12 1 c
13 0 c
14 1 c
15 1 c
a$values[a$values==1]=as.character(a$ind)[a$values==1]
unstack(a)
a b c
1 a b c
2 0 0 c
3 0 0 0
4 a b c
5 0 0 c
We can try iterating over the names of the data frame, and then handling each column, for a base R option:
df <- data.frame(a=c(1,0,0,1,0), b=c(1,0,0,1,0), c=c(1,1,0,1,1))
df <- data.frame(sapply(names(df), function(x) {
y <- df[[x]]
y[y == 1] <- x
return(y)
}))
df
a b c
1 a b c
2 0 0 c
3 0 0 0
4 a b c
5 0 0 c
Demo
You can do it with ifelse, but you have to do some intermediate transposing to account for R's column-major order processing.
data.frame(t(ifelse(t(df)==1,names(df),0)))
a b c
1 a b c
2 0 0 c
3 0 0 0
4 a b c
5 0 0 c

R: Update adjacency matrix/data frame using pairwise combinations

Question
Let's say I have this dataframe:
# mock data set
df.size = 10
cluster.id<- sample(c(1:5), df.size, replace = TRUE)
letters <- sample(LETTERS[1:5], df.size, replace = TRUE)
test.set <- data.frame(cluster.id, letters)
Will be something like:
cluster.id letters
<int> <fctr>
1 5 A
2 4 B
3 4 B
4 3 A
5 3 E
6 3 D
7 3 C
8 2 A
9 2 E
10 1 A
Now I want to group these per cluster.id and see what kind of letters I can find within a cluster, so for example cluster 3 contains the letters A,E,D,C. Then I want to get all unique pairwise combinations (but not combinations with itself so no A,A e.g.): A,E ; A,D, A,C etc. Then I want to update the pairwise distance for these combination in an adjacency matrix/data frame.
Idea
# group by cluster.id
# per group get all (unique) pairwise combinations for the letters (excluding pairwise combinations with itself, e.g. A,A)
# update adjacency for each pairwise combinations
What I tried
# empty adjacency df
possible <- LETTERS
adj.df <- data.frame(matrix(0, ncol = length(possible), nrow = length(possible)))
colnames(adj.df) <- rownames(adj.df) <- possible
# what I tried
update.adj <- function( data ) {
for (comb in combn(data$letters,2)) {
# stucked
}
}
test.set %>% group_by(cluster.id) %>% update.adj(.)
Probably there is an easy way to do this because I see adjacency matrices all the time, but I'm not able to figure it out.. Please let me know if it's not clear
Answer to comment
Answer to #Manuel Bickel:
For the data I gave as example (the table under "will be something like"):
This matrix will be A-->Z for the full dataset, keep that in mind.
A B C D E
A 0 0 1 1 2
B 0 0 0 0 0
C 1 0 0 1 1
D 1 0 1 0 1
E 2 0 1 1 0
I will explain what I did:
cluster.id letters
<int> <fctr>
1 5 A
2 4 B
3 4 B
4 3 A
5 3 E
6 3 D
7 3 C
8 2 A
9 2 E
10 1 A
Only the clusters containing more > 1 unique letter are relevant (because we don't want combinations with itself, e.g cluster 1 containing only letter B, so it would result in combination B,B and is therefore not relevant):
4 3 A
5 3 E
6 3 D
7 3 C
8 2 A
9 2 E
Now I look for each cluster what pairwise combinations I can make:
cluster 3:
A,E
A,D
A,C
E,D
E,C
D,C
Update these combination in the adjacency matrix:
A B C D E
A 0 0 1 1 1
B 0 0 0 0 0
C 1 0 0 1 1
D 1 0 1 0 1
E 2 0 1 1 0
Then go to the next cluster
cluster 2
A,E
Update the adjacency matrix again:
A B C D E
A 0 0 1 1 2 <-- note the 2 now
B 0 0 0 0 0
C 1 0 0 1 1
D 1 0 1 0 1
E 2 0 1 1 0
As reaction to the huge dataset
library(reshape2)
test.set <- read.table(text = "
cluster.id letters
1 5 A
2 4 B
3 4 B
4 3 A
5 3 E
6 3 D
7 3 C
8 2 A
9 2 E
10 1 A", header = T, stringsAsFactors = F)
x1 <- reshape2::dcast(test.set, cluster.id ~ letters)
x1
#cluster.id A B C D E
#1 1 1 0 0 0 0
#2 2 1 0 0 0 1
#3 3 1 0 1 1 1
#4 4 0 2 0 0 0
#5 5 1 0 0 0 0
x2 <- table(test.set)
x2
# letters
#cluster.id A B C D E
# 1 1 0 0 0 0
# 2 1 0 0 0 1
# 3 1 0 1 1 1
# 4 0 2 0 0 0
# 5 1 0 0 0 0
x1.c <- crossprod(x1)
#Error in crossprod(x, y) :
# requires numeric/complex matrix/vector arguments
x2.c <- crossprod(x2)
#works fine
Following above comment, here the code of Tyler Rinker used with your data. I hope this is what you want.
UPDATE: Following below comments, I added a solution using the package reshape2 in order to be able to handle larger amounts of data.
test.set <- read.table(text = "
cluster.id letters
1 5 A
2 4 B
3 4 B
4 3 A
5 3 E
6 3 D
7 3 C
8 2 A
9 2 E
10 1 A", header = T, stringsAsFactors = F)
x <- table(test.set)
x
letters
#cluster.id A B C D E
# 1 1 0 0 0 0
# 2 1 0 0 0 1
# 3 1 0 1 1 1
# 4 0 2 0 0 0
# 5 1 0 0 0 0
#base approach, based on answer by Tyler Rinker
x <- crossprod(x)
diag(x) <- 0 #this is to set matches such as AA, BB, etc. to zero
x
# letters
# letters
# A B C D E
# A 0 0 1 1 2
# B 0 0 0 0 0
# C 1 0 0 1 1
# D 1 0 1 0 1
# E 2 0 1 1 0
#reshape2 approach
x <- acast(test.set, cluster.id ~ letters)
x <- crossprod(x)
diag(x) <- 0
x
# A B C D E
# A 0 0 1 1 2
# B 0 0 0 0 0
# C 1 0 0 1 1
# D 1 0 1 0 1
# E 2 0 1 1 0

transform dataframe to a matrix in R [duplicate]

This question already has answers here:
How to reshape data from long to wide format
(14 answers)
Closed 6 years ago.
I have this data frame :
Var1 var 2 var3
var1 var2 var3
A B 1
B C 2
B A 3
D C 4
B D 5
And I would like to transform it to a matrix And add a column and a row to sum the values associated to each variable like this using R code:
A B C D Total
A 0 1 0 0 1
B 3 0 2 5 10
C 0 0 0 0 0
D 0 0 4 0 4
T 3 1 6 5
Can you suggest me a way of doing it ?
Thanks a lot!!
nms <- sort(unique(c(as.character(df$var1),as.character(df$var2))));
m <- matrix(vector(typeof(df$var3),1L),length(nms),length(nms),dimnames=list(nms,nms));
m[cbind(as.character(df$var1),as.character(df$var2))] <- df$var3;
m;
## A B C D
## A 0 1 0 0
## B 3 0 2 5
## C 0 0 0 0
## D 0 0 4 0
The as.character() coercions can be omitted if the var1 and var2 input columns are already character vectors.
Data
df <- data.frame(var1=c('A','B','B','D','B'),var2=c('B','C','A','C','D'),var3=c(1L,2L,3L,4L,
5L));
Marginal totals can be added as follows:
m <- cbind(m,Total=rowSums(m));
m <- rbind(m,T=colSums(m));
m;
## A B C D Total
## A 0 1 0 0 1
## B 3 0 2 5 10
## C 0 0 0 0 0
## D 0 0 4 0 4
## T 3 1 6 5 15

Resources