convert data.frame to another data.frame - r

I have two data.frames as follows:
dt2017 = data.frame(id=LETTERS[1:5],year=2017,city1=c(0,1,0,1,0),city2=c(0,0,1,0,0),city3=c(1,0,1,0,1),city4=c(0,0,0,0,1))
dt2017
id year city1 city2 city3 city4
1: A 2017 0 0 1 0
2: B 2017 1 0 0 0
3: C 2017 0 1 1 0
4: D 2017 1 0 0 0
5: E 2017 0 0 1 1
dt2016 = data.frame(id=LETTERS[1:5],year=2016,city1=c(0,0,0,0,1),city2=c(0,0,0,1,0),city3=c(0,0,1,0,1),city4=c(1,1,0,0,1))
dt2016
id year city1 city2 city3 city4|
1: A 2016 0 0 0 1
2: B 2016 0 0 0 1
3: C 2016 0 0 1 0
4: D 2016 0 1 0 0
5: E 2016 1 0 1 1
"1" in the data.frame can represent working in the city. For example, in 2016, A,B and E work in the same city4. First, I want to get following data.frame:
id 2016 2017 2016+2017
1: A B;E C;E B;C;E
2: B A;E D A;D;E
3: C E A;E A;E
4: D NA B B
5: E A;B;C A;C A;B;C
Second, I want to get a data.frame like this:
id relation
A B
A C
A E
B A
B D
B E
D B
E A
E B
E C
Any suggestions would be greatly appreciated.

I have found a way to achieve what you want, but it is not very pretty. Still it does what you want.
library(plyr)
dt2017$id <- as.character(dt2017$id)
dt2016$id <- as.character(dt2016$id)
id <- dt2017$id
my_function <- function(dt, x){
tmp <- data.frame(id=id, dt[,dt[dt$id==x,]==1])
tmp$ind <- sapply(1:nrow(tmp), function(x) return(sum(tmp[x, 2:ncol(tmp)])))
return(paste(tmp[tmp$ind > 0 & tmp$id !=x,"id"], collapse=";"))
}
results1 <- data.frame(Year2016 = sapply(id, function(x) return(my_function(dt2016,x))),
Year2017 = sapply(id, function(x) return(my_function(dt2017,x))))
my_function2 <- function(dt, x){
tmp <- data.frame(id=id, dt[,dt[dt$id==x,]==1])
tmp$ind <- sapply(1:nrow(tmp), function(x) return(sum(tmp[x, 2:ncol(tmp)])))
if(length(tmp[tmp$ind > 0 & tmp$id !=x,"id"])!=0){
return(data.frame(id=x, relation=tmp[tmp$ind > 0 & tmp$id !=x,"id"]))
}
}
results_tmp <- rbind(adply(.data=id, .margins=1, .fun= function(x) return(my_function2(dt2016,x))),
adply(.data=id, .margins=1, .fun= function(x) return(my_function2(dt2017,x))))[, c("id", "relation")]
results2 <- unique(results_tmp[order(as.character(results_tmp$id)),])
fun_tmp <- function(x) return(paste(x, collapse=";"))
bothyear_tmp <- aggregate(list(relation=results2$relation), by=list(id=results2 $"id"), FUN=fun_tmp)
results1$BothYear <- bothyear_tmp[order(as.character(bothyear_tmp$id)),"relation"]
And here are the results:
results1
Year2016 Year2017 BothYear
A B;E C;E B;E;C
B A;E D A;E;D
C E A;E E;A
D B B
E A;B;C A;C A;B;C
results2
id relation
A B
A E
A C
B A
B E
B D
C E
C A
D B
E A
E B
E C

Related

Detecting key words across multiple columns and flagging them each in new columns

I have this type of data:
set.seed(123)
df <- data.frame(
v1 = sample(LETTERS[1:10], 5),
v2 = sample(LETTERS[1:10], 5),
v3 = sample(LETTERS[1:10], 5),
v4 = sample(LETTERS[1:10], 5)
)
as well as a number of key words:
keys <- c("A", "C", "F", "H")
I want to (i) detect which keyword is present in each column of df and (ii) create new columns for each of the keys, recording with 1if that key is present. I've been using a for loop so far, which, however, does not work:
library(stringr)
for(i in keys){
df[i] <- +str_detect(apply(df, 1, paste0, collapse = " "), keys[i])
}
df
v1 v2 v3 v4 A C F H
1 C A J I NA NA NA NA
2 H E E C NA NA NA NA
3 D H F A NA NA NA NA
4 G D I J NA NA NA NA
5 F C A F NA NA NA NA
I don't know why it doesn't work, as the same logic applied to a single key does work:
+str_detect(apply(df, 1, paste0, collapse = " "), keys[1])
[1] 1 0 1 0 1
I'd be grateful for advice on how the for loop must be tweaked but also for an entirely different solution to the problem (e.g., dplyr).
A base solution:
cbind(df, +sapply(keys, grepl, x = do.call(paste, df)))
# v1 v2 v3 v4 A C F H
# 1 C E J I 0 1 0 0
# 2 J D E J 0 0 0 0
# 3 B F C E 0 1 1 0
# 4 H H H C 0 1 0 1
# 5 F A A B 1 0 1 0
In your for loop you are indexing on both the index and the name. Test keys["A"], it'll return NA. You can use which to avoid this:
library(stringr)
for(i in keys){
df[i] <- ifelse(str_detect(apply(df, 1, paste0, collapse = " "), keys[which(keys == i)]), 1, 0)
}
output
v1 v2 v3 v4 A C F H
1 C E J I 0 1 0 0
2 J D E J 0 0 0 0
3 B F C E 0 1 1 0
4 H H H C 0 1 0 1
5 F A A B 1 0 1 0
Another option would be to index on the position (1, 2, 3, 4):
for(i in seq_along(keys)){
df[keys[i]] <- ifelse(str_detect(apply(df, 1, paste0, collapse = " "), keys[i]), 1, 0)
}
Using dplyr's if_any:
library(stringr)
library(dplyr)
library(purrr)
bind_cols(df,
map_dfc(keys, \(x) df %>%
transmute(!!(x) := +(if_any(v1:v4, \(y) str_detect(x, y))))))
output
v1 v2 v3 v4 A C F H
1 C E J I 0 1 0 0
2 J D E J 0 0 0 0
3 B F C E 0 1 1 0
4 H H H C 0 1 0 1
5 F A A B 1 0 1 0

Replacing zero and one in data frame in R with letters

I have a data frame (df) which looks like this:
df$a df$b
T C 1 1 1 1
A G 0 1 1 0
C G 0 0 1 1
A T 0 0 0 0
I would like to have this output:
T C C C C C
A G A G G A
C G C C G G
A T A A A A
Basically if it is zero it takes the letter in column df$a while if it is 1 it takes the letter in column df$b and the zeros and ones are replaced.
Anybody knows how to do this in R?
You can do this with base functionality - note that I read the data in columns 1 and 2 as characters, not factors.
df <- read.table(text = "
T C 1 1 1 1
A G 0 1 1 0
C G 0 0 1 1
A T 0 0 0 0", header = FALSE, stringsAsFactors = FALSE)
df[, 3:ncol(df)] <- sapply(df[, 3:ncol(df)], function(x) ifelse(x == 1, df[, 2], df[, 1]))
df
#> V1 V2 V3 V4 V5 V6
#> 1 T C C C C C
#> 2 A G A G G A
#> 3 C G C C G G
#> 4 A T A A A A
Created on 2019-01-25 by the reprex package (v0.2.1)
df[, -(1:2)] <- ifelse(df[, -(1:2)] == 1, df[, 2], df[, 1])
Here is a base R solution.
df[-(1:2)] <- t(apply(df, 1, function(x) {
y <- as.numeric(x[-(1:2)])
x[1:2][y + 1]
}))
df
# a b c d e f
#1 T C C C C C
#2 A G A G G A
#3 C G C C G G
#4 A T A A A A
Data.
df <- read.table(text = "
a b c d e f
T C 1 1 1 1
A G 0 1 1 0
C G 0 0 1 1
A T 0 0 0 0
", header = TRUE)

Convert list of individuals to occurence of pairs in R

I need specific format of data.frame for social structure analysis. How to convert data.frame containing list of individuals occuring together on multiple events:
my.df <- data.frame(individual = c("A","B","C","B","C","D"),
time = rep(c("event_01","event_02"), each = 3))
individual time
1 A event_01
2 B event_01
3 C event_01
4 B event_02
5 C event_02
6 D event_02
into a data.frame containing occurence for each pairs (including [A,A]; [B,B] etc. pairs:
ind_1 ind_2 times
A A 0
A B 1
A C 1
A D 0
B A 1
B B 0
B C 2
B D 1
C A 1
C B 2
C C 0
C D 1
D A 0
D B 1
D C 1
D D 0
In base R, you could do the following:
data.frame(as.table(`diag<-`(tcrossprod(table(my.df)), 0)))
# individual individual.1 Freq
# 1 A A 0
# 2 B A 1
# 3 C A 1
# 4 D A 0
# 5 A B 1
# 6 B B 0
# 7 C B 2
# 8 D B 1
# 9 A C 1
# 10 B C 2
# 11 C C 0
# 12 D C 1
# 13 A D 0
# 14 B D 1
# 15 C D 1
# 16 D D 0
tcrossprod gives you the following:
> tcrossprod(table(my.df))
individual
individual A B C D
A 1 1 1 0
B 1 2 2 1
C 1 2 2 1
D 0 1 1 1
That's essentially all the information you are looking for, but you want it in a slightly different form, without the diagonal values.
We can set the diagonals to zero with:
`diag<-`(theOutputFromAbove, 0)
Then, to get the long form, trick R into thinking that the resulting matrix is a table by using as.table, and make use of the data.frame method for tables.
You can do:
create the first 2 variables of the new data.frame:
df2 <- expand.grid(ind_2=levels(my.df$individual), ind_1=levels(my.df$individual))[, 2:1]
Put the value to 0 for the pairs of same individuals:
df2$times[df2[, 1]==df2[, 2]] <- 0
See the other unique combinations:
comb_diff <- combn(levels(my.df$individual), 2)
compute the times each unique combination is found together:
times_uni <- apply(comb_diff, 2, function(inds){
sum(table(my.df$time[my.df$individual %in% inds])==2)
})
Finally, fill the new data.frame:
df2$times[match(c(paste0(comb_diff[1,], comb_diff[2,]), paste0(comb_diff[2, ], comb_diff[1, ])), paste0(df2[, 1],df2[, 2]))] <- rep(times_uni, 2)
df2
# ind_1 ind_2 times
#1 A A 0
#2 A B 1
#3 A C 1
#4 A D 0
#5 B A 1
#6 B B 0
#7 B C 2
#8 B D 1
#9 C A 1
#10 C B 2
#11 C C 0
#12 C D 1
#13 D A 0
#14 D B 1
#15 D C 1
#16 D D 0
You can do it using data.table
dt_combs <- my.dt[,
list(ind_1 = combn(individual, 2)[1, ],
ind_2 = combn(individual, 2)[2, ]),
by = time]
dt_ncombs <- dt_combs[, .N, by = c("ind_1", "ind_2")]
dt_ncombs_inverted <- copy(dt_ncombs)
dt_ncombs_inverted[, temp := ind_1]
dt_ncombs_inverted[, ind_1 := ind_2]
dt_ncombs_inverted[, ind_2 := temp]
dt_ncombs_inverted[, temp := NULL]
dt_ncombs <- rbind(dt_ncombs, dt_ncombs_inverted)
dt_allcombs <- data.table(expand.grid(
ind_1 = my.dt[, unique(individual)],
ind_2 = my.dt[, unique(individual)]
))
dt_final <- merge(dt_allcombs,
dt_ncombs,
all.x = TRUE,
by = c("ind_1", "ind_2"))
dt_final[is.na(N), N := 0]
dt_final

Create "contingency" table with multi-rows

Let's consider this dataset, where the first field is a bill number and the second one is the name of a product :
df=data.frame(bill=c(1,1,1,1,2,2,2,2,3,3),product=c("A","B","C","B","A","C","E","D","C","D"))
I would like to count the number of bills containing each combination of two products, for example in this case a result like this (I don't want to keep combinations where count is 0) :
# prod1 prod2 count
# A B 1
# A C 2
# A D 1
# A E 1
# B C 1
# C D 2
# C E 1
# D E 1
I have a solution with loops but it's really not pretty (and slow !):
products=sort(unique(df$product))
bills_list=list()
for (i in 1:length(products)){
bills_list[[i]]=unique(df[which(df$product==products[i]),"bill"])
}
df2=data.frame(prod1=character(0),prod2=character(0),count=numeric(0))
for (i in 1:(length(products)-1)){
for (j in (i+1):length(products)){
Nij=length(intersect(bills_list[[i]],bills_list[[j]]))
if (Nij>0){
temp=data.frame(prod1=products[i],prod2=products[j],count=Nij)
df2=rbind(df2,temp)
}
}
}
Is there a way to do this without loops ?
Thank you for your time.
Here's a solution with plyr and data.table.
# needed packages
require(plyr)
require(data.table)
# find the combinations in each of the bills
combs <- ddply(df, .(bill), function(x){
t(combn(unique(as.character(x$product)),2))
})
colnames(combs) <- c("bill", "prod1", "prod2")
# combine these
res <- data.table(combs, key=c("prod1", "prod2"))[, .N, by=list(prod1, prod2)]
library(reshape2)
df$product <- as.character(df$product)
products <- t(combn(unique(df$product), 2))
dat <- dcast(bill ~ product, data = df)
## bill A B C D E
## 1 1 1 2 1 0 0
## 2 2 1 0 1 1 1
## 3 3 0 0 1 1 0
out <- structure(
data.frame(products, apply(products, 1, function(x) sum(rowSums(dat[x] > 0) == 2) )),
names = c("prod1", "prod2", "count")
)
out[out$count != 0,]
## prod1 prod2 count
## 1 A B 1
## 2 A C 2
## 3 A E 1
## 4 A D 1
## 5 B C 1
## 8 C E 1
## 9 C D 2
## 10 E D 1
Here's another approach:
library(qdap)
dat <- unlist(lapply(split(df$product, df$bill), function(x) {
y <- outer(unique(x), unique(x), paste)
unlist(y[upper.tri(y)])
}))
dat2 <- data.frame(table(dat), stringsAsFactors = FALSE)
colsplit2df(dat2, sep=" ", new.names=paste0("prod", 1:2))
## prod1 prod2 Freq
## 1 A B 1
## 2 A C 2
## 3 A D 1
## 4 A E 1
## 5 B C 1
## 6 C D 2
## 7 C E 1
## 8 E D 1
res <- table(df$bill, df$product)
##> res
##
## A B C D E
## 1 1 2 1 0 0
## 2 1 0 1 1 1
## 3 0 0 1 1 0
res2 <- ifelse(res > 0, 1, 0)
##> res2
##
## A B C D E
## 1 1 1 1 0 0
## 2 1 0 1 1 1
## 3 0 0 1 1 0
cor(res2)
##
## A B C D E
##A 1.0 0.5 NA -0.5 0.5
##B 0.5 1.0 NA -1.0 -0.5
##C NA NA 1 NA NA
##D -0.5 -1.0 NA 1.0 0.5
##E 0.5 -0.5 NA 0.5 1.0
##Warning message:
##In cor(res2) : the standard deviation is zero
I do realize that this does not answer the question that you asked.
But, it may get you closer to the answer that, presumably, you seek. Namely, what is the impact of a customer ordering one product on the likelihood (positive or negative) that will order one of the others.

How can I add and calculate a column's value in R

This is the first part of my code:
BSum=0.0
mydata = NULL
while(BSum < 5)
{
A=(rpois (1, lambda=1))
y=runif(A,0,1)
B1 = length(which(y<=0.5))
BSum = BSum + B1
C = A - B1
mydata=rbind(mydata,c("A"=A,"B"=B1,"C"=C))
}
I need 3 more columns here. For column D(row x) I would generate as many random nos. (between 0 and 1) as is the value in Column B(row x). Then I see how many of those random nos. are less than or equal to 0.1. I put the total count of these in Column D. The remainder (B-D) becomes column F. I generate another column E that will get populated the same same way D was generated from B. The remainder again gets added to what had accumulated in Column F.
:= is from the data.table package. As you don't have this loaded, either your object isn't a data.table object or it is but you don't say and the package was not loaded.
If the former (your object is a data frame, not a data.table) then you want cbind(). As in:
set.seed(1)
df <- data.frame(A = runif(10))
cbind(df, list(B = runif(10), C = letters[1:10]))
> cbind(df, list(B = runif(10), C = letters[1:10]))
A B C
1 0.26550866 0.2059746 a
2 0.37212390 0.1765568 b
3 0.57285336 0.6870228 c
4 0.90820779 0.3841037 d
5 0.20168193 0.7698414 e
6 0.89838968 0.4976992 f
7 0.94467527 0.7176185 g
8 0.66079779 0.9919061 h
9 0.62911404 0.3800352 i
10 0.06178627 0.7774452 j
For your particular problem, try:
myfun <- function(z) {
ret1 <- apply(z, 1, function(x) sum(runif(x) <= 0.1))
ret2 <- z[,1] - ret1
cbind(z, B = ret1, C = ret2)
}
set.seed(1)
df <- data.frame(A = rpois(10, 2))
myfun(df)
> myfun(df)
A B C
1 1 0 1
2 1 0 1
3 2 0 2
4 4 0 4
5 1 0 1
6 4 0 4
7 4 1 3
8 2 0 2
9 2 0 2
10 0 0 0
You could make this more efficient, say by not doing each row individually, but it'd involve more coding.
Updated
If I understand your update (and I might not as I already showed you how to do those steps, though not in the same configuration as you now want), then I think this is what you wanted. Note that how you create E is a little ambiguous. I took you literally and just did exactly the same as for D.
set.seed(2)
BSum <- 0.0
mydata <- NULL
while(BSum < 5) {
A <- rpois(1, lambda = 1)
B1 <- sum(runif(A, 0, 1) <= 0.5)
BSum <- BSum + B1
C <- A - B1
D <- sum(runif(B1) <= 0.1)
F <- B1 - D
E <- sum(runif(B1) <= 0.1)
F <- F + (D - E)
mydata <- rbind(mydata, c(A = A, B = B1, C = C, D = D, E = E, F = F))
}
With that seed I get
R> mydata
A B C D E F
[1,] 0 0 0 0 0 0
[2,] 1 0 1 0 0 0
[3,] 0 0 0 0 0 0
[4,] 3 1 2 0 0 1
[5,] 1 1 0 0 0 1
[6,] 1 0 1 0 0 0
[7,] 3 3 0 0 0 3

Resources