how to replace values by comparing two columns in r - r

I have a dataframe looks like:
df<-read.table(text="ID RE AL
140343 TC T
200012 A G
457096 GAA GAAA
555084 AG A
557151 T TAA
752311 GAATTAAT GAAT
810001 ATTTTT ATTTT
880420 GAAAAAAAAA GAAAAAAAAAA", header=TRUE, colClasses="character")
I would like to replace the longer string in column "RE" or "AL" with letter "I", and the shorter one replaced with letter "D". if both columns have one letter, no change.
the expected result:
ID RE AL
140343 I D
200012 A G
457096 D I
555084 I D
557151 D I
752311 I D
810001 I D
880420 D I
I tried my script as:
max <- apply(df[2:3], 1, function(x) max(nchar(x)))
index <- max > 1
if(nchar(df$RE[index])==max[index]){
df$RE[index] <- "I"
df$AL[index] <- "D"
}else{
df$RE[index] <- "D"
df$AL[index] <- "I"
}

A base R vectorized solution. First line defines a subset of rows to work on. Then two lines with opposite directions for the comparison lets you choose either "D" or "I" based on the comparisons:
noneq <- with( df, (nchar(RE) != 1)|( nchar(AL) != 1) )
df[ noneq, "RE"] <- with(df[ noneq, ], c("D","I")[1+(nchar(RE) > nchar(AL) )])
df[ noneq, "AL"] <- with(df[ noneq, ], c("D","I")[1+(RE=="D" )]) # opposite of RE
df
#==============
ID RE AL
1 140343 I D
2 200012 A G
3 457096 D I
4 555084 I D
5 557151 D I
6 752311 I D
7 810001 I D
8 880420 D I

Here is a dplyr solution that may work for you
library(dplyr)
df %>%
mutate(RE = ifelse(nchar(RE) != 1 | nchar(AL) != 1,
ifelse(nchar(RE) > nchar(AL), 'I', 'D'), RE),
AL = ifelse(RE=='I', 'D', ifelse(RE=='D', 'I', AL)))
## ID RE AL
## 1 140343 I D
## 2 200012 A G
## 3 457096 D I
## 4 555084 I D
## 5 557151 D I
## 6 752311 I D
## 7 810001 I D
## 8 880420 D I

Here is a simple for loop that gets the job done:
for (i in seq(1:nrow(df))){
if(nchar(df[i, 3]) - nchar(df[i, 2]) < 0){
df[i, 3] <- "D"
df[i, 2] <- "I"
}else if(nchar(df[i, 3]) - nchar(df[i, 2]) > 0){
df[i, 3] <- "I"
df[i, 2] <- "D"
}
}

An alternative base R solution (compareble to #42- 's answer, but with pre-defining the indexes):
# create needed indexes
idx1 <- !(nchar(df$RE) == 1 & nchar(df$AL) == 1)
idx2 <- (nchar(df$RE) > nchar(df$AL)) + 1L
idx3 <- (nchar(df$RE) < nchar(df$AL)) + 1L
# replace the values
df$RE[idx1] <- c('D','I')[idx2][idx1]
df$AL[idx1] <- c('D','I')[idx3][idx1]
which gives:
> df
ID RE AL
1 140343 I D
2 200012 A G
3 457096 D I
4 555084 I D
5 557151 D I
6 752311 I D
7 810001 I D
8 880420 D I

Related

Find values in data frame 2 which is found in data frame 1, within a certain range

I want to find which values in df2 which is also present in df1, within a certain range. One value is considering both a and b in the data frames (a & b can't split up). For examples, can I find 9,1 (df1[1,1]) in df2? It doesn't have to be on the same position. Also, we can allow a diff of for example 1 for "a" and 1 for "b". For example, I want to find all values 9+-1,1+-1 in df2. "a" & "b" always go together, each row stick together. Does anyone have a suggestion of how to code this? Many many thanks!
set.seed(1)
a <- sample(10,5)
set.seed(1)
b <- sample(5,5, replace=T)
feature <- LETTERS[1:5]
df1 <- data.frame(feature,a,b)
df1
> df1
feature a b
A 9 1
B 4 4
C 7 1
D 1 2
E 2 5
set.seed(2)
a <- sample(10,5)
b <- sample(5,5, replace=T)
feature <- LETTERS[1:5]
df2 <- data.frame(feature,a,b)
df2
df2
feature a b
A 5 1
B 6 4
C 9 5
D 1 1
E 10 2
Not correct but Im imaging this can be done for a for loop somehow!
for(i in df1[,1]) {
for(j in df1[,2]){
s<- c(s,(df1[i,1] & df1[j,2]== df2[,1] & df2[,2]))# how to add certain allowed diff levels?
}
}
s
Output wanted:
feature_df1 <- LETTERS[1:5]
match <- c(1,0,0,1,0)
feature_df2 <- c("E","","","D", "")
df <- data.frame(feature_df1, match, feature_df2)
df
feature_df1 match feature_df2
A 1 E
B 0
C 0
D 1 D
E 0
I loooove data.table, which is (imo) the weapon of choice for these kind of problems..
library( data.table )
#make df1 and df2 a data.table
setDT(df1, key = "feature"); setDT(df2)
#now perform a join operation on each row of df1,
# creating an on-the-fly subset of df2
df1[ df1, c( "match", "feature_df2") := {
val = df2[ a %between% c( i.a - 1, i.a + 1) & b %between% c(i.b - 1, i.b + 1 ), ]
unique_val = sort( unique( val$feature ) )
num_val = length( unique_val )
list( num_val, paste0( unique_val, collapse = ";" ) )
}, by = .EACHI ][]
# feature a b match feature_df2
# 1: A 9 1 1 E
# 2: B 4 4 0
# 3: C 7 1 0
# 4: D 1 2 1 D
# 5: E 2 5 0
One way to go about this in Base R would be to split the data.frames() into a list of rows then calculate the absolute difference of row vectors to then evaluate how large the absolute difference is and if said difference is larger than a given value.
Code
# Find the absolute difference of all row vectors
listdif <- lapply(l1, function(x){
lapply(l2, function(y){
abs(x - y)
})
})
# Then flatten the list to a list of data.frames
listdifflat <- lapply(listdif, function(x){
do.call(rbind, x)
})
# Finally see if a pair of numbers is within our threshhold or not
m1 <- 2
m2 <- 3
listfin <- Map(function(x){
x[1] > m1 | x[2] > m2
},
listdifflat)
head(listfin, 1)
[[1]]
V1
[1,] TRUE
[2,] FALSE
[3,] TRUE
[4,] TRUE
[5,] TRUE
[6,] TRUE
[7,] TRUE
[8,] TRUE
[9,] TRUE
[10,] TRUE
Data
df1 <- read.table(text = "
4 1
7 5
1 5
2 10
13 6
19 10
11 7
17 9
14 5
3 5")
df2 <- read.table(text = "
15 1
6 3
19 6
8 2
1 3
13 7
16 8
12 7
9 1
2 6")
# convert df to list of row vectors
l1<- lapply(1:nrow(df1), function(x){
df1[x, ]
})
l2 <- lapply(1:nrow(df2), function(x){
df2[x, ]
})

'Proper' way to do row-wise replacement

I have a data frame which looks something like:
dataDemo <- data.frame(POS = 1:4 , REF = c("A" , "T" , "G" , "C") ,
ind1 = c("A" , "." , "G" , "C") , ind2 = c("A" , "C" , "C" , "."),
stringsAsFactors=FALSE)
dataDemo
POS REF ind1 ind2
1 1 A A A
2 2 T . C
3 3 G G C
4 4 C C .
and I'd like to replace all the "."s with the REF value for that row. Here is how I did it:
for(i in seq_along(dataDemo$REF)){
dataDemo[i , ][dataDemo[i , ] == '.'] <- dataDemo$REF[i]
}
I'd like to know if there's a more 'proper' or idiomatic way of doing this in R. I generally try to use *apply whenever possible and this seems like something that could easily be adapted to that approach and made more readable (and run faster), but despite throwing a good bit of time at it I haven't made much progress.
In dplyr,
library(dplyr)
dataDemo %>% mutate_each(funs(ifelse(. == '.', REF, as.character(.))), -POS)
# POS REF ind1 ind2
# 1 1 A A A
# 2 2 T T C
# 3 3 G G C
# 4 4 C C C
Here's another base R alternative, where we use the row numbers of the "." occurrences to replace them by the appropriate REF values.
# Get row numbers
rownrs <- which(dataDemo==".", arr.ind = TRUE)[,1]
# Replace values
dataDemo[dataDemo=="."] <- dataDemo$REF[rownrs]
# Result
dataDemo
# POS REF ind1 ind2
#1 1 A A A
#2 2 T T C
#3 3 G G C
#4 4 C C C
Here is an option using set from data.table, which should be fast.
library(data.table)
setDT(dataDemo)
nm1 <- paste0("ind", 1:2)
for(j in nm1){
i1 <- dataDemo[[j]]=="."
set(dataDemo, i = which(i1), j=j, value = dataDemo$REF[i1])
}
dataDemo
# POS REF ind1 ind2
#1: 1 A A A
#2: 2 T T C
#3: 3 G G C
#4: 4 C C C
EDIT: Based on #alexis_laz's comments
Or using dplyr
library(dplyr)
dataDemo %>%
mutate_each(funs(ifelse(.==".", REF,.)), ind1:ind2)
# POS REF ind1 ind2
#1 1 A A A
#2 2 T T C
#3 3 G G C
#4 4 C C C
Or we can use base R methods to do this in a single line.
dataDemo[nm1] <- lapply(dataDemo[nm1], function(x) ifelse(x==".", dataDemo$REF, x))

R: reshape data frame when one column has unequal number of entries

I have a data frame x with 2 character columns:
x <- data.frame(a = numeric(), b = I(list()))
x[1:3,"a"] = 1:3
x[[1, "b"]] <- "a, b, c"
x[[2, "b"]] <- "d, e"
x[[3, "b"]] <- "f"
x$a = as.character(x$a)
x$b = as.character(x$b)
x
str(x)
The entries in column b are comma-separated strings of characters.
I need to produce this data frame:
1 a
1 b
1 c
2 d
2 e
3 f
I know how to do it when I loop row by row. But is it possible to do without looping?
Thank you!
Have you checked out require(splitstackshape)?
> cSplit(x, "b", ",", direction = "long")
a b
1: 1 a
2: 1 b
3: 1 c
4: 2 d
5: 2 e
6: 3 f
> s <- strsplit(as.character(x$b), ',')
> data.frame(value=rep(x$a, sapply(s, FUN=length)),b=unlist(s))
value b
1 1 a
2 1 b
3 1 c
4 2 d
5 2 e
6 3 f
there you go, should be very fast:
library(data.table)
x <- data.table(x)
x[ ,strsplit(b, ","), by = a]

How can a change variable name within for loop to get lists("divide") corresponding to 'slide1' and 'slide2'

Suppose have a dataframe like this :-
df<- read.table(text="groups names
1 a
1 b
1 c
1 d
2 e
2 f
2 g
2 h
", header=T)
I divided this data frame into two groups by using
split_groups <-split(df, df$groups)
Then I used for loop to obtain the overlapping lists of split_group[[1]] and split_group[[2]] as follows:
slide <- list()
for(i in 1:2){
slide[[i]] <- rollapply(split_groups[[i]][,2], width =2,by=1, matrix, align="right")
}
And obtained this :-
slide[[1]]:
a
b
**b**
c
**c**
d
slide[[2]] :
e
f
**f**
g
**g**
h
then I divided slide[[1]] and slide[[2]] into lists of equal rows:
divide <- split(slide[[1]], cumsum(seq_len(nrow(slide[[1]])) %%2 == 1))
and obtained divide[[1]] = a,b ; divide[[2]] = b,c and so on.
Similarly from slide[[2]], divide[[1]] = e,f and so on.
I want to rbind divide[[1]] from split[[1]] and split[[2]] ie set1 = a,b,e,f in the form of list or dataframe.
Similarly divide[[2]] from split[[1]] and split[[2]] ie set2= b,c,f,g.
ie
set1:
a
b
e
f
set2:
b
c
f
g
How can I do this ?
May be you want this: (The slide output is different than it was showed in the post)
divide1 <- split(slide[[1]], cumsum(seq_len(nrow(slide[[1]])) %%2 == 1))
divide2 <- split(slide[[2]], cumsum(seq_len(nrow(slide[[2]])) %%2 == 1))
nm1 <- paste0("set", 1:2)
Map(function(x,y,z) setNames(data.frame(c(x,y)),z), divide1, divide2, nm1)
#$`1`
# set1
#1 a
#2 b
#3 e
#4 f
#$`2`
# set2
#1 b
#2 f
Or if you have more list elements in slide, you could do:
divide <- lapply(slide, function(x) split(x, cumsum(!!seq_len(nrow(x)) %%2)))
divN <- unlist(divide)
lstN <- split(unname(divN), substr(names(divN),1,1))
nm1 <- paste0("set", seq_along(lstN))
Map(function(x,y) setNames(data.frame(x),y), lstN, nm1)
#$`1`
# set1
#1 a
#2 b
#3 e
#4 f
#$`2`
# set2
#1 b
#2 f

Group columns with the same name in R

If I have a data frame as below, with the first row the column names (row names not included here)
A B C D E F G H I
a b c a a b c c c
1 2 3 4 5 6 7 8 9
How would I be able create a new data frame such that:
a b c
1 2 3
4 6 7
5 NA 8
NA NA 9
Notice the NA. For empty values.
UPDATE
If d.frame is the dataframe in question:
new.df <- data.frame();
firstrow <- d.frame[,1]
names <- unique(firstrow)
for (n in names) {
#cbind.fill is part of a package plyr
new.df <- cbind.fill(new.df, frame[3,which(firstrow == n)])
}
colnames(new.df) <- names;
I think that works well. But it isn't efficient and relies on a third party package. Any suggestions?
Here is another solution, based on function cbind.fill from cbind a df with an empty df (cbind.fill?)
cbind.fill<-function(...){
nm <- list(...)
nm<-lapply(nm, as.matrix)
n <- max(sapply(nm, nrow))
do.call(cbind, lapply(nm, function (x)
rbind(x, matrix(, n-nrow(x), ncol(x)))))
}
df <- read.table(text = "A B C D E F G H I
a b c a a b c c c
1 2 3 4 5 6 7 8 9", header = T, as.is=T)
df <- as.matrix(df)
do.call(cbind.fill, split(df[2,], df[1,]))
And another one solution
df <- as.matrix(df)
lst <- split(df[2,], df[1,])
m <- max(sapply(lst, length))
result <- sapply(lst, function(x) {length(x) <- m; x})
Couldn't find a simple solution for this, so here's one option using base R as you requested in comments. This solution will work no matter how many columns you have in the original data
temp <- read.table(text = "A B C D E F G H I
a b c a a b c c c
1 2 3 4 5 6 7 8 9", header = T) # your data
temp <- data.frame(t(temp))
lengths <- table(temp[, 1])
maxval <- max(lengths)
data.frame(do.call(cbind, lapply(levels(temp[, 1]), function(x) c(x, temp[temp[, 1] == x, 2], rep(NA, maxval - lengths[x])))))
## X1 X2 X3
## 1 a b c
## 2 1 2 3
## 3 4 6 7
## 4 5 <NA> 8
## 5 <NA> <NA> 9
I would transpose the original two-row data.frame, create a "time" variable, use reshape to reorganize the data, and transpose the result.
Like this:
x <- t(mydf)
y <- data.frame(cbind(x, ave(x[, 1], x[, 1], FUN = seq_along)))
t(reshape(y, direction = "wide", idvar = "X1", timevar = "X3"))
# A B C
# X1 "a" "b" "c"
# X2.1 "1" "2" "3"
# X2.2 "4" "6" "7"
# X2.3 "5" NA "8"
# X2.4 NA NA "9"

Resources