Dataframes in R with multi-level rows and columns

Let's say I have the following data frame.
> df = data.frame(rowsA = sample(c('A','B','C'), 100, replace=TRUE),
rowsB = sample(c('D','E','F'), 100, replace=TRUE),
colsA = sample(c('G','H','I'), 100, replace=TRUE),
colsB = sample(c('J','K','L'), 100, replace=TRUE))
> head(df)
rowsA rowsB colsA colsB
1 B E I L
2 A E G J
3 A E H K
4 A D I J
5 C F G J
6 A F G J
Is it possible to create a multi-level table of counts?
In Excel, this is possible with the PivotTable functionality.
I think it is possible in Python with pandas (df.columns.levels).
I also figured out how to do multi-level rows in R with dplyr, but I haven't figured out multi-level columns:
library(dplyr); library(tidyr)
df %>%
  group_by(rowsA, rowsB, colsA) %>%
  summarise(count = n()) %>%
  spread(colsA, count)
# A tibble: 9 x 5
# Groups: rowsA, rowsB [9]
rowsA rowsB G H I
* <fctr> <fctr> <int> <int> <int>
1 A D 5 3 1
2 A E 1 2 1
3 A F 5 8 NA
4 B D 5 5 5
5 B E 2 4 6
6 B F 4 6 5
7 C D 2 6 NA
8 C E 6 5 3
9 C F 4 3 3

Paste the columns that go to the header into one column, then reshape: this gives a contingency table with the same meaning as the multi-level count:
library(dplyr); library(tidyr)
df %>%
  unite(header, c('colsA', 'colsB')) %>%
  count(rowsA, rowsB, header) %>%
  spread(header, n, fill = 0)
# A tibble: 9 x 11
# rowsA rowsB G_J G_K G_L H_J H_K H_L I_J I_K I_L
#* <fctr> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 A D 1 0 0 0 3 1 1 1 0
#2 A E 2 0 0 1 1 0 0 0 1
#3 A F 5 0 0 3 2 1 0 1 1
#4 B D 0 1 1 1 0 3 1 1 1
#5 B E 2 2 1 3 1 1 0 3 1
#6 B F 1 1 2 3 3 0 1 2 1
#7 C D 0 2 3 1 2 0 4 3 2
#8 C E 2 2 2 1 2 0 0 1 1
#9 C F 1 0 1 2 0 1 2 1 2
Or, if you are OK with a table/array/matrix as the result, you can use xtabs (borrowed from this answer), which essentially gives a 4-d array; with ftable it can be displayed the way you need:
ftable(xtabs(data = df), row.vars = 1:2, col.vars = 3:4)
# colsA G H I
# colsB J K L J K L J K L
#rowsA rowsB
#A D 1 0 0 0 3 1 1 1 0
# E 2 0 0 1 1 0 0 0 1
# F 5 0 0 3 2 1 0 1 1
#B D 0 1 1 1 0 3 1 1 1
# E 2 2 1 3 1 1 0 3 1
# F 1 1 2 3 3 0 1 2 1
#C D 0 2 3 1 2 0 4 3 2
# E 2 2 2 1 2 0 0 1 1
# F 1 0 1 2 0 1 2 1 2
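If you later need those counts as a plain long-format data frame rather than a printed ftable, a small follow-up (not part of the original answer, assuming df as above) is to call as.data.frame() on the same xtabs() result:
# hedged sketch: long-format counts from the same xtabs() call
counts_long <- as.data.frame(xtabs(data = df))
head(counts_long)  # columns rowsA, rowsB, colsA, colsB, Freq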


Define a new column

I want to define a new column that is 1 for rows where col3 == loop, is 2 for the rows after a col3 == loop row until they reach a row where col3 == loop, col3 == a, or col3 == b, and is 0 otherwise.
Example:
group1 group2 col3
1 1 a
1 1 loop
1 1 d
1 1 c
1 1 b
1 1 f
1 2 loop
1 2 a
1 2 loop
1 2 f
2 1 loop
2 1 g
Expected output:
group1 group2 col3 loop
1 1 a 0
1 1 loop 1
1 1 d 2
1 1 c 2
1 1 b 2
1 1 f 0
1 2 loop 1
1 2 a 2
1 2 h 0
1 2 f 0
2 1 b 0
2 1 g 0
Let me know if it is not clear.
The following code does what the question asks for. It uses index vectors to assign 1 and 2 to the appropriate places in the new column. I have read in the expected output and created a column loop2 so the result can be compared with the posted loop column; drop the comparison once you have checked that the answer is right.
i1 <- which(df1$col3 == "loop")          # positions of the "loop" rows
i2 <- which(df1$col3 %in% c("a", "b"))   # positions of the terminating "a"/"b" rows
df1$loop2 <- 0
df1$loop2[i1] <- 1
# for each "loop" row, index the run from the next row up to the next "a"/"b" row
k <- Map(`:`, (i1 + 1), i2[findInterval(i1, i2) + 1])
df1$loop2[unlist(k)] <- 2
df1
# group1 group2 col3 loop loop2
#1 1 1 a 0 0
#2 1 1 loop 1 1
#3 1 1 d 2 2
#4 1 1 c 2 2
#5 1 1 b 2 2
#6 1 1 f 0 0
#7 1 2 loop 1 1
#8 1 2 a 2 2
#9 1 2 h 0 0
#10 1 2 f 0 0
#11 2 1 b 0 0
#12 2 1 g 0 0
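A quick sanity check that the computed loop2 matches the posted loop column (a small addition to the answer, assuming df1 as read in below):
all(df1$loop == df1$loop2)  # TRUE for the data above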
Data.
df1 <- read.table(text = "
group1 group2 col3 loop
1 1 a 0
1 1 loop 1
1 1 d 2
1 1 c 2
1 1 b 2
1 1 f 0
1 2 loop 1
1 2 a 2
1 2 h 0
1 2 f 0
2 1 b 0
2 1 g 0
", header = TRUE)

R: df header columns are ordinal ranking and spread across columns for each observation

I have a questionnaire data that look like below:
items no_stars1 no_stars2 no_stars3 average satisfied bad
1 A 1 0 0 0 0 1
2 B 0 1 0 1 0 0
3 C 0 0 1 0 1 0
4 D 0 1 0 0 1 0
5 E 0 0 1 1 0 0
6 F 0 0 1 0 1 0
7 G 1 0 0 0 0 1
Basically, the header columns (number-of-stars rating and satisfaction) are ordinal rankings for each item. I would like to collapse no_stars (columns 2:4) and satisfactory (columns 5:7) into one column each, so that the output looks like this:
items no_stars satisfactory
1 A 1 1
2 B 2 2
3 C 3 3
4 D 2 3
5 E 3 2
6 F 3 3
7 G 1 1
For no_stars: 1 stands for no_stars1, 2 for no_stars2, 3 for no_stars3.
For satisfactory: 1 stands for bad, 2 for average, 3 for satisfied.
I have tried the code below:
df$no_stars2[df$no_stars2 == 1] <- 2
df$no_stars3[df$no_stars3 == 1] <- 3
df$average[df$average == 1] <- 2
df$satisfied[df$satisfied == 1] <- 3
no_stars <- df$no_stars1 + df$no_stars2 + df$no_stars3
satisfactory <- df$bad + df$average + df$satisfied
tidy_df <- data.frame(df$items, no_stars, satisfactory)
tidy_df
Is there any function in R that can do the same thing, or does anyone have a better and simpler solution?
Thanks!
Just use max.col and set the column order as the preference order:
starsOrder <- c("no_stars1", "no_stars2", "no_stars3")
satOrder <- c("bad", "average", "satisfied")
data.frame(items = df$items,
           no_stars = max.col(df[, starsOrder]),
           satisfactory = max.col(df[, satOrder]))
# items no_stars satisfactory
#1 A 1 1
#2 B 2 2
#3 C 3 3
#4 D 2 3
#5 E 3 2
#6 F 3 3
#7 G 1 1
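One caveat worth adding: max.col breaks ties at random by default, so if a respondent could tick more than one box in a group you may want a deterministic rule; a hedged variation:
# take the first matching column on ties instead of a random one
data.frame(items = df$items,
           no_stars = max.col(df[, starsOrder], ties.method = "first"),
           satisfactory = max.col(df[, satOrder], ties.method = "first"))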
Another tidyverse solution, making use of factor-to-integer conversion to encode no_stars and satisfactory, and gathering from wide to long twice:
library(tidyverse)
df %>%
  gather(no_stars, v1, starts_with("no_stars")) %>%
  mutate(no_stars = as.integer(factor(no_stars))) %>%
  gather(satisfactory, v2, average, satisfied, bad) %>%
  filter(v1 > 0 & v2 > 0) %>%
  mutate(satisfactory = as.integer(factor(
    satisfactory, levels = c("bad", "average", "satisfied")))) %>%
  select(-v1, -v2) %>%
  arrange(items)
# items no_stars satisfactory
#1 A 1 1
#2 B 2 2
#3 C 3 3
#4 D 2 3
#5 E 3 2
#6 F 3 3
#7 G 1 1
While there may be more elegant solutions, using dplyr::case_when() gives you the flexibility to code things however you want:
library(dplyr)
df %>%
dplyr::mutate(
no_stars = dplyr::case_when(
no_stars1 == 1 ~ 1,
no_stars2 == 1 ~ 2,
no_stars3 == 1 ~ 3)
, satisfactory = dplyr::case_when(
average == 1 ~ 2,
satisfied == 1 ~ 3,
bad == 1 ~ 1)
)
# items no_stars1 no_stars2 no_stars3 average satisfied bad no_stars satisfactory
# 1 A 1 0 0 0 0 1 1 1
# 2 B 0 1 0 1 0 0 2 2
# 3 C 0 0 1 0 1 0 3 3
# 4 D 0 1 0 0 1 0 2 3
# 5 E 0 0 1 1 0 0 3 2
# 6 F 0 0 1 0 1 0 3 3
# 7 G 1 0 0 0 0 1 1 1
Another option: recode each indicator column so its 1s become the column name, coalesce the star columns and the satisfaction columns, and convert those names to integer codes (this assumes dplyr and tidyr are loaded and dat is the questionnaire data frame):
dat %>%
  replace(. == 1, NA) %>%
  replace_na(setNames(as.list(names(.)), names(.))) %>%
  replace(. == 0, NA) %>%
  mutate(s = coalesce(!!!.[2:4]),
         no_stars = as.numeric(factor(s, unique(s))),
         t = coalesce(!!!.[5:7]),
         satisfactory = as.numeric(factor(t, unique(t)))) %>%
  select(items, no_stars, satisfactory)
items no_stars satisfactory
1 A 1 1
2 B 2 2
3 C 3 3
4 D 2 3
5 E 3 2
6 F 3 3
7 G 1 1
Using apply and match:
data.frame(
  items = df1$items,
  no_stars = apply(df1[2:4], 1, match, x = 1),
  satisfactory = apply(df1[c(7, 5:6)], 1, match, x = 1))  # columns reordered to bad, average, satisfied
# items no_stars satisfactory
# 1 A 1 1
# 2 B 2 2
# 3 C 3 3
# 4 D 2 3
# 5 E 3 2
# 6 F 3 3
# 7 G 1 1
data
df1 <- read.table(header=TRUE,stringsAsFactors=FALSE,text="
items no_stars1 no_stars2 no_stars3 average satisfied bad
1 A 1 0 0 0 0 1
2 B 0 1 0 1 0 0
3 C 0 0 1 0 1 0
4 D 0 1 0 0 1 0
5 E 0 0 1 1 0 0
6 F 0 0 1 0 1 0
7 G 1 0 0 0 0 1")
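For completeness, a hedged sketch of the same gather/encode idea using the newer tidyr verbs (pivot_longer has superseded gather); it assumes df1 as read in above:
library(dplyr)
library(tidyr)
df1 %>%
  pivot_longer(starts_with("no_stars"), names_to = "no_stars", values_to = "v1") %>%
  pivot_longer(c(bad, average, satisfied), names_to = "satisfactory", values_to = "v2") %>%
  filter(v1 > 0, v2 > 0) %>%
  mutate(no_stars = as.integer(factor(no_stars)),
         satisfactory = as.integer(factor(satisfactory,
                                          levels = c("bad", "average", "satisfied")))) %>%
  select(items, no_stars, satisfactory) %>%
  arrange(items)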

How can I count occurrences of different integer values (in one column) for rows with different factor values (another column)?

I have a dataframe which looks something like this:
My_Data = data.frame(name = rep(LETTERS[1:10], 3), number = sample(0:3, 30, replace = TRUE))
name number
1 A 3
2 B 3
3 C 0
4 D 3
5 E 2
6 F 2
7 G 2
8 H 2
9 I 1
10 J 3
11 A 1
12 B 2
13 C 0
14 D 1
15 E 3
16 F 0
17 G 2
18 H 2
19 I 2
20 J 2
21 A 0
22 B 1
23 C 3
24 D 0
25 E 2
26 F 0
27 G 1
28 H 1
29 I 3
30 J 0
Now I would like to get a data frame with one column for each possible value in the number column, containing the count of occurrences of that value for each value in the name column:
name number_0 number_1 number_2 number_3
1 A 1 1 0 1
2 B 0 1 1 1
3 C 2 0 0 1
4 D 1 1 0 1
5 E 0 0 2 1
6 F 2 0 1 0
7 G 0 1 2 0
8 H 0 1 2 0
9 I 0 1 1 1
10 J 1 0 1 1
How can I do that?
Thanks!
Edit: I am not looking for a conversion to the wide format. I am looking for a way to count occurrences for each of the possible values.
You can also use the xtabs() function.
xtabs(~My_Data$name + My_Data$number)
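If you want exactly the data frame shown in the question (a name column plus number_* headers), a hedged follow-up that builds on the xtabs() result:
tab <- xtabs(~ name + number, data = My_Data)
out <- as.data.frame.matrix(tab)            # wide counts, names as rownames
names(out) <- paste0("number_", names(out))
data.frame(name = rownames(out), out, row.names = NULL)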
We could get the count and then spread to 'wide' format
library(dplyr)
library(tidyr)
My_Data %>%
  count(name, number) %>%
  mutate(number = paste('number', number, sep = '_')) %>%
  spread(number, n, fill = 0)
# A tibble: 10 x 5
# name number_0 number_1 number_2 number_3
# * <chr> <dbl> <dbl> <dbl> <dbl>
# 1 A 1 1 0 1
# 2 B 0 1 1 1
# 3 C 2 0 0 1
# 4 D 1 1 0 1
# 5 E 0 0 2 1
# 6 F 2 0 1 0
# 7 G 0 1 2 0
# 8 H 0 1 2 0
# 9 I 0 1 1 1
#10 J 1 0 1 1
Try also:
table(My_Data)
or, if you need a data.frame:
as.data.frame.matrix(table(My_Data))
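For reference, a hedged sketch of the same count-and-widen step with the newer tidyr interface (pivot_wider has superseded spread), assuming My_Data as defined in the question:
library(dplyr)
library(tidyr)
My_Data %>%
  count(name, number) %>%
  pivot_wider(names_from = number, values_from = n,
              names_prefix = "number_", values_fill = 0)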

R: Update adjacency matrix/data frame using pairwise combinations

Question
Let's say I have this dataframe:
# mock data set
df.size = 10
cluster.id<- sample(c(1:5), df.size, replace = TRUE)
letters <- sample(LETTERS[1:5], df.size, replace = TRUE)
test.set <- data.frame(cluster.id, letters)
It will look something like this:
cluster.id letters
<int> <fctr>
1 5 A
2 4 B
3 4 B
4 3 A
5 3 E
6 3 D
7 3 C
8 2 A
9 2 E
10 1 A
Now I want to group these by cluster.id and see which letters occur within each cluster; for example, cluster 3 contains the letters A, E, D, C. Then I want to get all unique pairwise combinations (but not combinations of a letter with itself, so no A,A): A,E; A,D; A,C; etc. Finally, I want to update the pairwise counts for these combinations in an adjacency matrix/data frame.
Idea
# group by cluster.id
# per group get all (unique) pairwise combinations for the letters (excluding pairwise combinations with itself, e.g. A,A)
# update adjacency for each pairwise combinations
What I tried
# empty adjacency df
possible <- LETTERS
adj.df <- data.frame(matrix(0, ncol = length(possible), nrow = length(possible)))
colnames(adj.df) <- rownames(adj.df) <- possible
# what I tried
update.adj <- function(data) {
  for (comb in combn(data$letters, 2)) {
    # stuck here
  }
}
test.set %>% group_by(cluster.id) %>% update.adj(.)
Probably there is an easy way to do this, because I see adjacency matrices all the time, but I'm not able to figure it out. Please let me know if anything is unclear.
Answer to @Manuel Bickel's comment:
For the data I gave as an example (the table under "It will look something like this"):
Keep in mind that for the full dataset this matrix will run A through Z.
A B C D E
A 0 0 1 1 2
B 0 0 0 0 0
C 1 0 0 1 1
D 1 0 1 0 1
E 2 0 1 1 0
I will explain what I did:
cluster.id letters
<int> <fctr>
1 5 A
2 4 B
3 4 B
4 3 A
5 3 E
6 3 D
7 3 C
8 2 A
9 2 E
10 1 A
Only the clusters containing more than one unique letter are relevant (because we don't want combinations of a letter with itself; e.g. cluster 4 contains only the letter B, which would only give the combination B,B and is therefore not relevant):
4 3 A
5 3 E
6 3 D
7 3 C
8 2 A
9 2 E
Now, for each cluster, I look at which pairwise combinations I can make.
Cluster 3:
A,E
A,D
A,C
E,D
E,C
D,C
Update these combinations in the adjacency matrix:
A B C D E
A 0 0 1 1 1
B 0 0 0 0 0
C 1 0 0 1 1
D 1 0 1 0 1
E 1 0 1 1 0
Then go to the next cluster, cluster 2:
A,E
Update the adjacency matrix again:
A B C D E
A 0 0 1 1 2 <-- note the 2 now
B 0 0 0 0 0
C 1 0 0 1 1
D 1 0 1 0 1
E 2 0 1 1 0
In reaction to the comment about a huge dataset:
library(reshape2)
test.set <- read.table(text = "
cluster.id letters
1 5 A
2 4 B
3 4 B
4 3 A
5 3 E
6 3 D
7 3 C
8 2 A
9 2 E
10 1 A", header = T, stringsAsFactors = F)
x1 <- reshape2::dcast(test.set, cluster.id ~ letters)
x1
#cluster.id A B C D E
#1 1 1 0 0 0 0
#2 2 1 0 0 0 1
#3 3 1 0 1 1 1
#4 4 0 2 0 0 0
#5 5 1 0 0 0 0
x2 <- table(test.set)
x2
# letters
#cluster.id A B C D E
# 1 1 0 0 0 0
# 2 1 0 0 0 1
# 3 1 0 1 1 1
# 4 0 2 0 0 0
# 5 1 0 0 0 0
x1.c <- crossprod(x1)
#Error in crossprod(x, y) :
# requires numeric/complex matrix/vector arguments
x2.c <- crossprod(x2)
#works fine
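The error occurs because dcast() returns a data.frame (still holding the cluster.id column) rather than the numeric matrix that crossprod() expects; you also want to drop cluster.id so it does not enter the counts. A hedged fix for x1:
x1.m <- as.matrix(x1[, -1])   # drop the id column and convert to a matrix
x1.c <- crossprod(x1.m)
diag(x1.c) <- 0               # zero out self-matches (A-A, B-B, ...)
x1.c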
Following the comment above, here is Tyler Rinker's code applied to your data; I hope this is what you want.
UPDATE: following the comments below, I added a solution using the reshape2 package to handle larger amounts of data.
test.set <- read.table(text = "
cluster.id letters
1 5 A
2 4 B
3 4 B
4 3 A
5 3 E
6 3 D
7 3 C
8 2 A
9 2 E
10 1 A", header = T, stringsAsFactors = F)
x <- table(test.set)
x
# letters
#cluster.id A B C D E
# 1 1 0 0 0 0
# 2 1 0 0 0 1
# 3 1 0 1 1 1
# 4 0 2 0 0 0
# 5 1 0 0 0 0
#base approach, based on answer by Tyler Rinker
x <- crossprod(x)
diag(x) <- 0 #this is to set matches such as AA, BB, etc. to zero
x
#         letters
# letters A B C D E
# A 0 0 1 1 2
# B 0 0 0 0 0
# C 1 0 0 1 1
# D 1 0 1 0 1
# E 2 0 1 1 0
#reshape2 approach
x <- acast(test.set, cluster.id ~ letters)
x <- crossprod(x)
diag(x) <- 0
x
# A B C D E
# A 0 0 1 1 2
# B 0 0 0 0 0
# C 1 0 0 1 1
# D 1 0 1 0 1
# E 2 0 1 1 0
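If you prefer the result as a data frame of letter pairs rather than a matrix (the question mentions both), a small hedged follow-up on the matrix x computed above:
adj_pairs <- as.data.frame(as.table(x))   # melt the adjacency matrix into rows
names(adj_pairs) <- c("letter1", "letter2", "count")
head(adj_pairs)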

Conditional variable using R code

I have a data set named "dats".
id y i j
1 0 1 1
1 0 1 2
1 0 1 3
2 1 2 1
2 1 2 2
2 1 2 3
I want to calculate a new variable ynew = y[i, j-1] * y[i, j], i.e. based on (y11*y12, y12*y13, ... and so on). I have tried it this way:
ynew <- NULL
for(p in 1)
{
for (q in ni)
{
ynew[p,q] <- dats$y[dats$i==p & dats$j==q-1]*dats$y[dats$i==p & dats$j==q]
}
}
ynew
But it shows an error!
Expected output:
id y i j ynew
1 0 1 1 NA
1 0 1 2 0
1 0 1 3 0
2 1 2 1 NA
2 1 2 2 1
2 1 2 3 1
Could anybody help? TIA
Using dplyr and rollapply from the zoo package:
library(dplyr)
library(zoo)
dats %>%
  group_by(id) %>%
  mutate(ynew = c(NA, rollapply(y, 2, prod)))  # width-2 rolling product of consecutive y values
#Source: local data frame [6 x 5]
#Groups: id [2]
# id y i j ynew
# (int) (int) (int) (int) (dbl)
#1 1 0 1 1 NA
#2 1 0 1 2 0
#3 1 0 1 3 0
#4 2 1 2 1 NA
#5 2 1 2 2 1
#6 2 1 2 3 1
Maybe we just need to multiply by the lag of 'y', grouped by 'id':
library(data.table)
setDT(dats)[, ynew := y * shift(y), by = id]
dats
# id y i j ynew
#1: 1 0 1 1 NA
#2: 1 0 1 2 0
#3: 1 0 1 3 0
#4: 2 1 2 1 NA
#5: 2 1 2 2 1
#6: 2 1 2 3 1
It could also be done with roll_prod
library(RcppRoll)
setDT(dats)[, ynew := c(NA, roll_prod(y, 2)), by = id]
dats
# id y i j ynew
#1: 1 0 1 1 NA
#2: 1 0 1 2 0
#3: 1 0 1 3 0
#4: 2 1 2 1 NA
#5: 2 1 2 2 1
#6: 2 1 2 3 1
