Replacing zero and one in data frame in R with letters - r

I have a data frame (df) which looks like this:
df$a df$b
T C 1 1 1 1
A G 0 1 1 0
C G 0 0 1 1
A T 0 0 0 0
I would like to have this output:
T C C C C C
A G A G G A
C G C C G G
A T A A A A
Basically if it is zero it takes the letter in column df$a while if it is 1 it takes the letter in column df$b and the zeros and ones are replaced.
Anybody knows how to do this in R?

You can do this with base functionality - note that I read the data in columns 1 and 2 as characters, not factors.
df <- read.table(text = "
T C 1 1 1 1
A G 0 1 1 0
C G 0 0 1 1
A T 0 0 0 0", header = FALSE, stringsAsFactors = FALSE)
df[, 3:ncol(df)] <- sapply(df[, 3:ncol(df)], function(x) ifelse(x == 1, df[, 2], df[, 1]))
df
#> V1 V2 V3 V4 V5 V6
#> 1 T C C C C C
#> 2 A G A G G A
#> 3 C G C C G G
#> 4 A T A A A A
Created on 2019-01-25 by the reprex package (v0.2.1)

df[, -(1:2)] <- ifelse(df[, -(1:2)] == 1, df[, 2], df[, 1])

Here is a base R solution.
df[-(1:2)] <- t(apply(df, 1, function(x) {
y <- as.numeric(x[-(1:2)])
x[1:2][y + 1]
}))
df
# a b c d e f
#1 T C C C C C
#2 A G A G G A
#3 C G C C G G
#4 A T A A A A
Data.
df <- read.table(text = "
a b c d e f
T C 1 1 1 1
A G 0 1 1 0
C G 0 0 1 1
A T 0 0 0 0
", header = TRUE)

Related

Detecting key words across multiple columns and flagging them each in new columns

I have this type of data:
set.seed(123)
df <- data.frame(
v1 = sample(LETTERS[1:10], 5),
v2 = sample(LETTERS[1:10], 5),
v3 = sample(LETTERS[1:10], 5),
v4 = sample(LETTERS[1:10], 5)
)
as well as a number of key words:
keys <- c("A", "C", "F", "H")
I want to (i) detect which keyword is present in each column of df and (ii) create new columns for each of the keys, recording with 1if that key is present. I've been using a for loop so far, which, however, does not work:
library(stringr)
for(i in keys){
df[i] <- +str_detect(apply(df, 1, paste0, collapse = " "), keys[i])
}
df
v1 v2 v3 v4 A C F H
1 C A J I NA NA NA NA
2 H E E C NA NA NA NA
3 D H F A NA NA NA NA
4 G D I J NA NA NA NA
5 F C A F NA NA NA NA
I don't know why it doesn't work, as the same logic applied to a single key does work:
+str_detect(apply(df, 1, paste0, collapse = " "), keys[1])
[1] 1 0 1 0 1
I'd be grateful for advice on how the for loop must be tweaked but also for an entirely different solution to the problem (e.g., dplyr).
A base solution:
cbind(df, +sapply(keys, grepl, x = do.call(paste, df)))
# v1 v2 v3 v4 A C F H
# 1 C E J I 0 1 0 0
# 2 J D E J 0 0 0 0
# 3 B F C E 0 1 1 0
# 4 H H H C 0 1 0 1
# 5 F A A B 1 0 1 0
In your for loop you are indexing on both the index and the name. Test keys["A"], it'll return NA. You can use which to avoid this:
library(stringr)
for(i in keys){
df[i] <- ifelse(str_detect(apply(df, 1, paste0, collapse = " "), keys[which(keys == i)]), 1, 0)
}
output
v1 v2 v3 v4 A C F H
1 C E J I 0 1 0 0
2 J D E J 0 0 0 0
3 B F C E 0 1 1 0
4 H H H C 0 1 0 1
5 F A A B 1 0 1 0
Another option would be to index on the position (1, 2, 3, 4):
for(i in seq_along(keys)){
df[keys[i]] <- ifelse(str_detect(apply(df, 1, paste0, collapse = " "), keys[i]), 1, 0)
}
Using dplyr's if_any:
library(stringr)
library(dplyr)
library(purrr)
bind_cols(df,
map_dfc(keys, \(x) df %>%
transmute(!!(x) := +(if_any(v1:v4, \(y) str_detect(x, y))))))
output
v1 v2 v3 v4 A C F H
1 C E J I 0 1 0 0
2 J D E J 0 0 0 0
3 B F C E 0 1 1 0
4 H H H C 0 1 0 1
5 F A A B 1 0 1 0

convert data.frame to another data.frame

I have two data.frames as follows:
dt2017 = data.frame(id=LETTERS[1:5],year=2017,city1=c(0,1,0,1,0),city2=c(0,0,1,0,0),city3=c(1,0,1,0,1),city4=c(0,0,0,0,1))
dt2017
id year city1 city2 city3 city4
1: A 2017 0 0 1 0
2: B 2017 1 0 0 0
3: C 2017 0 1 1 0
4: D 2017 1 0 0 0
5: E 2017 0 0 1 1
dt2016 = data.frame(id=LETTERS[1:5],year=2016,city1=c(0,0,0,0,1),city2=c(0,0,0,1,0),city3=c(0,0,1,0,1),city4=c(1,1,0,0,1))
dt2016
id year city1 city2 city3 city4|
1: A 2016 0 0 0 1
2: B 2016 0 0 0 1
3: C 2016 0 0 1 0
4: D 2016 0 1 0 0
5: E 2016 1 0 1 1
"1" in the data.frame can represent working in the city. For example, in 2016, A,B and E work in the same city4. First, I want to get following data.frame:
id 2016 2017 2016+2017
1: A B;E C;E B;C;E
2: B A;E D A;D;E
3: C E A;E A;E
4: D NA B B
5: E A;B;C A;C A;B;C
Second, I want to get a data.frame like this:
id relation
A B
A C
A E
B A
B D
B E
D B
E A
E B
E C
Any suggestions would be greatly appreciated.
I have found a way to achieve what you want, but it is not very pretty. Still it does what you want.
library(plyr)
dt2017$id <- as.character(dt2017$id)
dt2016$id <- as.character(dt2016$id)
id <- dt2017$id
my_function <- function(dt, x){
tmp <- data.frame(id=id, dt[,dt[dt$id==x,]==1])
tmp$ind <- sapply(1:nrow(tmp), function(x) return(sum(tmp[x, 2:ncol(tmp)])))
return(paste(tmp[tmp$ind > 0 & tmp$id !=x,"id"], collapse=";"))
}
results1 <- data.frame(Year2016 = sapply(id, function(x) return(my_function(dt2016,x))),
Year2017 = sapply(id, function(x) return(my_function(dt2017,x))))
my_function2 <- function(dt, x){
tmp <- data.frame(id=id, dt[,dt[dt$id==x,]==1])
tmp$ind <- sapply(1:nrow(tmp), function(x) return(sum(tmp[x, 2:ncol(tmp)])))
if(length(tmp[tmp$ind > 0 & tmp$id !=x,"id"])!=0){
return(data.frame(id=x, relation=tmp[tmp$ind > 0 & tmp$id !=x,"id"]))
}
}
results_tmp <- rbind(adply(.data=id, .margins=1, .fun= function(x) return(my_function2(dt2016,x))),
adply(.data=id, .margins=1, .fun= function(x) return(my_function2(dt2017,x))))[, c("id", "relation")]
results2 <- unique(results_tmp[order(as.character(results_tmp$id)),])
fun_tmp <- function(x) return(paste(x, collapse=";"))
bothyear_tmp <- aggregate(list(relation=results2$relation), by=list(id=results2 $"id"), FUN=fun_tmp)
results1$BothYear <- bothyear_tmp[order(as.character(bothyear_tmp$id)),"relation"]
And here are the results:
results1
Year2016 Year2017 BothYear
A B;E C;E B;E;C
B A;E D A;E;D
C E A;E E;A
D B B
E A;B;C A;C A;B;C
results2
id relation
A B
A E
A C
B A
B E
B D
C E
C A
D B
E A
E B
E C

Cross comparison of columns of the same data.frame

I have a data.frame that looks like this:
> DF1
A B C D E
a x c h p
c d q t w
s e r p a
w l t s i
p i y a f
I would like to compare each column of my data.frame with the remaining columns in order to count the number of common elements. For example, I would like to compare column A with all the remaining columns (B, C, D, E) and count the common entities in this way:
A versus the remaining:
A vs B: 0 (because they have 0 common elements)
A vs C: 1 (c in common)
A vs D: 2 (p and s in common)
A vs E: 3 (p,w,a, in common)
Then the same: B versus C,D,E columns and so on.
How can I implement this?
We can loop through the column names and compare with the other columns, by taking the intersect and get the length
sapply(names(DF1), function(x) {
x1 <- lengths(Map(intersect, DF1[setdiff(names(DF1), x)], DF1[x]))
c(x1, setNames(0, setdiff(names(DF1), names(x1))))[names(DF1)]})
# A B C D E
#A 0 0 1 3 3
#B 0 0 0 0 1
#C 1 0 0 1 0
#D 3 0 1 0 2
#E 3 1 0 2 0
Or this can be done more compactly by taking the cross product after getting the frequency of the long formatted (melt) dataset
library(reshape2)
tcrossprod(table(melt(as.matrix(DF1))[-1])) * !diag(5)
# Var2
#Var2 A B C D E
# A 0 0 1 3 3
# B 0 0 0 0 1
# C 1 0 0 1 0
# D 3 0 1 0 2
# E 3 1 0 2 0
NOTE: The crossprod part is also implemented with RcppEigen here which would make this faster
An alternative is to use combn twice, once to get the column combinations and next to find the lengths of the element intersections.
cbind.data.frame returns a data.frame and setNames is used to add column names.
setNames(cbind.data.frame(t(combn(names(df), 2)),
combn(names(df), 2, function(x) length(intersect(df[, x[1]], df[, x[2]])))),
c("col1", "col2", "count"))
col1 col2 count
1 A B 0
2 A C 1
3 A D 3
4 A E 3
5 B C 0
6 B D 0
7 B E 1
8 C D 1
9 C E 0
10 D E 2

Convert list of individuals to occurence of pairs in R

I need specific format of data.frame for social structure analysis. How to convert data.frame containing list of individuals occuring together on multiple events:
my.df <- data.frame(individual = c("A","B","C","B","C","D"),
time = rep(c("event_01","event_02"), each = 3))
individual time
1 A event_01
2 B event_01
3 C event_01
4 B event_02
5 C event_02
6 D event_02
into a data.frame containing occurence for each pairs (including [A,A]; [B,B] etc. pairs:
ind_1 ind_2 times
A A 0
A B 1
A C 1
A D 0
B A 1
B B 0
B C 2
B D 1
C A 1
C B 2
C C 0
C D 1
D A 0
D B 1
D C 1
D D 0
In base R, you could do the following:
data.frame(as.table(`diag<-`(tcrossprod(table(my.df)), 0)))
# individual individual.1 Freq
# 1 A A 0
# 2 B A 1
# 3 C A 1
# 4 D A 0
# 5 A B 1
# 6 B B 0
# 7 C B 2
# 8 D B 1
# 9 A C 1
# 10 B C 2
# 11 C C 0
# 12 D C 1
# 13 A D 0
# 14 B D 1
# 15 C D 1
# 16 D D 0
tcrossprod gives you the following:
> tcrossprod(table(my.df))
individual
individual A B C D
A 1 1 1 0
B 1 2 2 1
C 1 2 2 1
D 0 1 1 1
That's essentially all the information you are looking for, but you want it in a slightly different form, without the diagonal values.
We can set the diagonals to zero with:
`diag<-`(theOutputFromAbove, 0)
Then, to get the long form, trick R into thinking that the resulting matrix is a table by using as.table, and make use of the data.frame method for tables.
You can do:
create the first 2 variables of the new data.frame:
df2 <- expand.grid(ind_2=levels(my.df$individual), ind_1=levels(my.df$individual))[, 2:1]
Put the value to 0 for the pairs of same individuals:
df2$times[df2[, 1]==df2[, 2]] <- 0
See the other unique combinations:
comb_diff <- combn(levels(my.df$individual), 2)
compute the times each unique combination is found together:
times_uni <- apply(comb_diff, 2, function(inds){
sum(table(my.df$time[my.df$individual %in% inds])==2)
})
Finally, fill the new data.frame:
df2$times[match(c(paste0(comb_diff[1,], comb_diff[2,]), paste0(comb_diff[2, ], comb_diff[1, ])), paste0(df2[, 1],df2[, 2]))] <- rep(times_uni, 2)
df2
# ind_1 ind_2 times
#1 A A 0
#2 A B 1
#3 A C 1
#4 A D 0
#5 B A 1
#6 B B 0
#7 B C 2
#8 B D 1
#9 C A 1
#10 C B 2
#11 C C 0
#12 C D 1
#13 D A 0
#14 D B 1
#15 D C 1
#16 D D 0
You can do it using data.table
dt_combs <- my.dt[,
list(ind_1 = combn(individual, 2)[1, ],
ind_2 = combn(individual, 2)[2, ]),
by = time]
dt_ncombs <- dt_combs[, .N, by = c("ind_1", "ind_2")]
dt_ncombs_inverted <- copy(dt_ncombs)
dt_ncombs_inverted[, temp := ind_1]
dt_ncombs_inverted[, ind_1 := ind_2]
dt_ncombs_inverted[, ind_2 := temp]
dt_ncombs_inverted[, temp := NULL]
dt_ncombs <- rbind(dt_ncombs, dt_ncombs_inverted)
dt_allcombs <- data.table(expand.grid(
ind_1 = my.dt[, unique(individual)],
ind_2 = my.dt[, unique(individual)]
))
dt_final <- merge(dt_allcombs,
dt_ncombs,
all.x = TRUE,
by = c("ind_1", "ind_2"))
dt_final[is.na(N), N := 0]
dt_final

How can I add and calculate a column's value in R

This is the first part of my code:
BSum=0.0
mydata = NULL
while(BSum < 5)
{
A=(rpois (1, lambda=1))
y=runif(A,0,1)
B1 = length(which(y<=0.5))
BSum = BSum + B1
C = A - B1
mydata=rbind(mydata,c("A"=A,"B"=B1,"C"=C))
}
I need 3 more columns here. For column D(row x) I would generate as many random nos. (between 0 and 1) as is the value in Column B(row x). Then I see how many of those random nos. are less than or equal to 0.1. I put the total count of these in Column D. The remainder (B-D) becomes column F. I generate another column E that will get populated the same same way D was generated from B. The remainder again gets added to what had accumulated in Column F.
:= is from the data.table package. As you don't have this loaded, either your object isn't a data.table object or it is but you don't say and the package was not loaded.
If the former (your object is a data frame, not a data.table) then you want cbind(). As in:
set.seed(1)
df <- data.frame(A = runif(10))
cbind(df, list(B = runif(10), C = letters[1:10]))
> cbind(df, list(B = runif(10), C = letters[1:10]))
A B C
1 0.26550866 0.2059746 a
2 0.37212390 0.1765568 b
3 0.57285336 0.6870228 c
4 0.90820779 0.3841037 d
5 0.20168193 0.7698414 e
6 0.89838968 0.4976992 f
7 0.94467527 0.7176185 g
8 0.66079779 0.9919061 h
9 0.62911404 0.3800352 i
10 0.06178627 0.7774452 j
For your particular problem, try:
myfun <- function(z) {
ret1 <- apply(z, 1, function(x) sum(runif(x) <= 0.1))
ret2 <- z[,1] - ret1
cbind(z, B = ret1, C = ret2)
}
set.seed(1)
df <- data.frame(A = rpois(10, 2))
myfun(df)
> myfun(df)
A B C
1 1 0 1
2 1 0 1
3 2 0 2
4 4 0 4
5 1 0 1
6 4 0 4
7 4 1 3
8 2 0 2
9 2 0 2
10 0 0 0
You could make this more efficient, say by not doing each row individually, but it'd involve more coding.
Updated
If I understand your update (and I might not as I already showed you how to do those steps, though not in the same configuration as you now want), then I think this is what you wanted. Note that how you create E is a little ambiguous. I took you literally and just did exactly the same as for D.
set.seed(2)
BSum <- 0.0
mydata <- NULL
while(BSum < 5) {
A <- rpois(1, lambda = 1)
B1 <- sum(runif(A, 0, 1) <= 0.5)
BSum <- BSum + B1
C <- A - B1
D <- sum(runif(B1) <= 0.1)
F <- B1 - D
E <- sum(runif(B1) <= 0.1)
F <- F + (D - E)
mydata <- rbind(mydata, c(A = A, B = B1, C = C, D = D, E = E, F = F))
}
With that seed I get
R> mydata
A B C D E F
[1,] 0 0 0 0 0 0
[2,] 1 0 1 0 0 0
[3,] 0 0 0 0 0 0
[4,] 3 1 2 0 0 1
[5,] 1 1 0 0 0 1
[6,] 1 0 1 0 0 0
[7,] 3 3 0 0 0 3

Resources