Select rows based on multi-column attributes in R

I have to merge two (spatial) datasets, and for each shared ID I need to keep the row (polyline) with the most information, i.e. the longest line.
I can select the rows whose ID appears in the other dataframe, or the rows whose ID does not (see sample below), and the reverse of each operation. But I can't figure out how, for IDs present in both dataframes, to choose the row with the larger length value.
#set up sample data
sample = data.frame(x=c(1:5), length=c(1.2,1.3,1.5,7.2,36.1), ID_obj=c("a3", "4a", "5b", "8b", "a7"))
sample2 = data.frame(x=c(1:5), length=c(15.1,1.3,1.5,17.2,6.1), ID_obj=c("a3", "k6", "9c", "8b", "a7"))
#select the rows with identical values
sample[sample$ID_obj %in% sample2$ID_obj,]
sample2[sample2$ID_obj %in% sample$ID_obj,]
#select rows without duplicates in ID
'%!in%' <- function(x,y)!('%in%'(x,y))
sample[sample$ID_obj %!in% sample2$ID_obj,]
sample2[sample2$ID_obj %!in% sample$ID_obj,]
error<-rbind(sample, sample2[sample2$ID_obj %!in% sample$ID_obj,])
# x length ID_obj
#1 1 1.2 a3
#2 2 1.3 4a
#3 3 1.5 5b
#4 4 7.2 8b # wrong: 8b from the first set should not have been kept because its length is shorter
#5 5 36.1 a7
#21 2 1.3 k6
#31 3 1.5 9c
#this is the result I want to get automatically
final<-rbind(sample[c(2, 3, 5),], sample2[c(1, 2, 3, 4),])
# x length ID_obj
#2 2 1.3 4a
#3 3 1.5 5b
#5 5 36.1 a7 # keep a7 from the first set because its length is longer
#1 1 15.1 a3
#21 2 1.3 k6
#31 3 1.5 9c
#4 4 17.2 8b # keep 8b from the second set because its length is longer

Use the data.table package for a simplified syntax (and better performance than data.frame):
sample = data.frame(x=c(1:5), length=c(1.2,1.3,1.5,7.2,36.1), ID_obj=c("a3", "4a", "5b", "8b", "a7"))
sample2 = data.frame(x=c(1:5), length=c(15.1,1.3,1.5,17.2,6.1), ID_obj=c("a3", "k6", "9c", "8b", "a7"))
library(data.table)
setDT(sample) # convert data.frame to data.table "in-place"
setDT(sample2)
x <- rbind(sample, sample2) # combine rows vertically
setorder(x, -length) # order by length descending
x[, head(.SD, 1), by = ID_obj] # output the first row ("head") per ID_obj group
This gives the result (in a different order than your expected output):
ID_obj x length
1: a7 5 36.1
2: 8b 4 17.2
3: a3 1 15.1
4: 5b 3 1.5
5: 9c 3 1.5
6: 4a 2 1.3
7: k6 2 1.3
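If you prefer not to reorder the whole table, an equivalent idiom (a sketch using data.table's .I special symbol, which holds the row numbers) picks the index of the longest row per group:
x <- rbind(sample, sample2)
# for each ID_obj, .I[which.max(length)] is the row number of the longest row
idx <- x[, .I[which.max(length)], by = ID_obj]$V1
x[idx]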

A bit more cryptic with base functions, but just as an exercise:
x <- rbind(sample, sample2)
x <- x[order(x$length), ] # sort ascending so the longest row per ID comes last
x <- do.call(rbind, lapply(split(x, x$ID_obj), tail, n=1)) # keep the last row per ID_obj
x
# x length ID_obj
# 4a 2 1.3 4a
# 5b 3 1.5 5b
# 8b 4 17.2 8b
# a3 1 15.1 a3
# a7 5 36.1 a7
# 9c 3 1.5 9c
# k6 2 1.3 k6
Add rownames(x) <- NULL if you don't want to use ID_obj as row names.
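The same selection can also be sketched with dplyr (assuming dplyr >= 1.0.0 for slice_max()):
library(dplyr)
rbind(sample, sample2) %>%
  group_by(ID_obj) %>%
  slice_max(length, n = 1, with_ties = FALSE) %>% # keep the single longest row per ID
  ungroup()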

Related

Finding distinct values across multiple columns in R

The distinct() function in R finds unique values within a single column. However, I would like values to be unique regardless of which column they appear in.
The sample data is shown below.
10A appears in the second row under var 1. It appears again in the third row, this time in var 2. I would like to remove the third row, since at least one of its values (10A) is a duplicate.
In the 5th row, 10B has already appeared in row 2, so I would also like to remove the 5th row, as at least one of its values is a duplicate.
In the 6th row, although 7A appeared before in rows 3 and 5, those rows will have been removed, so 7A does not count as a duplicate and I would like to retain row 6.
Rows 7 and 8 have NA values. NA should not be considered a duplicate, so rows 7 and 8 should be retained.
How do I do it in R?
Sample data
var 1  var 2
5A     5B
10A    10B
7A     10A
6B     5C
10B    7A
10C    7A
99A    NA
NA     99B
Required Result
var 1  var 2
5A     5B
10A    10B
6B     5C
10C    7A
99A    NA
NA     99B
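For reference, a minimal construction of the sample data (a sketch; the syntactic names var1/var2 and plain character columns are assumptions):
df <- data.frame(
  var1 = c("5A", "10A", "7A", "6B", "10B", "10C", "99A", NA),
  var2 = c("5B", "10B", "10A", "5C", "7A", "7A", NA, "99B")
)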
If df has var1 and var2 columns and you want to keep only the rows whose var2 value does not occur anywhere in var1:
library(dplyr)
df |>
  filter(!var2 %in% unique(var1))
You can easily use recursion to accomplish this:
relation <- function(dat){
  if (nrow(dat) == 1) dat
  else
    # keep the first row, drop every row that shares a value with it,
    # then recurse on the remaining rows
    rbind(dat[1, ],
          relation(dat[!tapply(unlist(dat) %in% dat[1, ], row(dat), sum), ]))
}
relation(df)
var.1 var.2
1 5A 5B
2 10A 10B
4 6B 5C
6 10C 7A
Update:
The data:
dt <- read.table(header=TRUE, text = "
'var1' 'var2'
'5A' '5B2' #1
'10A' '10B' #2
'7A' '10A' #3 - to be removed
'6B' '5C' #4
'10B' '7A' #5 - to be removed
'10C' '7A' #6
'99A' 'NA' #7 - keep
'2A' '3B' #8
'NA' '99B' #9 - keep
'3A' '11B' #10")
Loop solution
looper <- function(dt) {
  uniqstock <- unlist(na.omit(dt[1, 1:ncol(dt)])) # values seen so far (row 1 is always kept)
  rows2keep <- TRUE # which rows to keep
  # loop through the data frame row by row
  for (r in 2:nrow(dt)) {
    rowdat <- na.omit(unlist(dt[r, ])) # na.omit so NAs never count as duplicates
    # are *any* values in the current row duplicates of previous ones?
    dupl <- any(rowdat %in% uniqstock)
    rows2keep <- c(rows2keep, !dupl)
    # only kept rows contribute their values to future duplicate checks
    if (!dupl)
      uniqstock <- c(uniqstock, rowdat)
  }
  dt[rows2keep, ] # return only the rows flagged as keepers
}
Applier solution
applier <- function(dt) {
  uniqstock <- character() # values seen so far in kept rows
  unqrows <- function(x) {
    # are *any* values in the current row duplicates of previous ones?
    dupl <- any(x %in% uniqstock)
    if (dupl) return(FALSE)
    # only kept rows contribute their (non-NA) values to future duplicate checks
    uniqstock <<- c(uniqstock, na.omit(x))
    return(TRUE)
  }
  rows2keep <- apply(dt, 1, unqrows)
  dt[rows2keep, ]
}
Recursion solution by @onyambu with small modifications to handle NAs correctly
recursor <- function(dt) {
  relation <- function(dat){
    if (nrow(dat) == 1) dat
    else {
      # Include <- unlist(dat) %in% dat[1,]
      Include <- match(unlist(dat), dat[1, ], nomatch = 0, incomparables = NA) > 0
      rbind(dat[1, ],
            relation(dat[!tapply(Include, row(dat), sum), ]))
    }
  }
  relation(dt)
}
Benchmark. Compare the solutions’ performance, since speed is important here:
library(microbenchmark)
microbenchmark(
  looper(dt), recursor(dt), applier(dt) #, check = "equivalent"
)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> looper(dt) 369.8 393.00 512.402 404.5 420.60 9094.1 100
#> recursor(dt) 1408.5 1446.75 1725.944 1459.6 1492.75 16427.9 100
#> applier(dt) 144.8 155.60 260.622 164.3 173.65 6143.0 100
Check the results of the solutions
looper(dt)
#> var1 var2
#> 1 5A 5B2
#> 2 10A 10B
#> 4 6B 5C
#> 6 10C 7A
#> 7 99A <NA>
#> 8 2A 3B
#> 9 <NA> 99B
#> 10 3A 11B
recursor(dt) # with the NA fix; the original version considered NA a duplicate
#> var1 var2
#> 1 5A 5B2
#> 2 10A 10B
#> 4 6B 5C
#> 6 10C 7A
#> 7 99A <NA>
#> 8 2A 3B
#> 9 <NA> 99B
#> 10 3A 11B
applier(dt) # loses column labels
#> var1 var2
#> 1 5A 5B2
#> 2 10A 10B
#> 4 6B 5C
#> 6 10C 7A
#> 7 99A <NA>
#> 8 2A 3B
#> 9 <NA> 99B
#> 10 3A 11B
Created on 2022-06-04 by the reprex package (v2.0.1)

return a vector in a column in data.table

I have a data.table in R, and I'm looking to create, for each row, a vector of the values in the .SDcols columns.
library("data.table")
dt = data.table(
  id = 1:6,
  A1 = sample(100, 6),
  A2 = sample(100, 6),
  A3 = sample(100, 6),
  B1 = sample(100, 6),
  B2 = sample(100, 6),
  B3 = sample(100, 6)
)
dt[,x1:=paste(.SD,collapse = ","),.SDcols=A1:B3,by=id]
dt[,x2:=strsplit(x1,",")] # x2 vector of characters
Now each row of x2 holds a vector of characters; however, I expected a vector of integers, like this:
R > dt
id A1 A2 A3 B1 B2 B3 x2
1: 1 72 23 76 10 35 14 c(72,23,76,10,35,14)
2: 2 44 28 77 29 20 63 c(44,28,77,29,20,63)
3: 3 18 34 43 77 76 100 c(18,34,43,77,76,100)
4: 4 15 33 50 87 86 86 c(15,33,50,87,86,86)
5: 5 71 71 41 75 8 3 c(71,71,41,75,8,3)
6: 6 11 89 98 42 72 27 c(11,89,98,42,72,27)
I tried several approaches; all of them failed.
dt[,x2:=.(list(.SD)),.SDcols=A1:B3,by=id] #x2 is <data.table>
dt[,x2:=.(lapply(.SD,c)),.SDcols=A1:B3,by=id]
dt[,x2:=.(c(.SD)), .SDcols=A1:B3,by=id] #RHS 1 is length 6 (greater than the size (1) of group 1). The last 5 element(s) will be discarded.
dt[,x2:=c(.SD),.SDcols=A1:B3,by=id] # x2 equals A1
dt[,x2:=lapply(.SD,c),.SDcols=A1:B3,by=id] # x2 equals A1
dt[,x2:=sapply(.SD,c),.SDcols=A1:B3,by=id] # x2 equals A1
Any suggestion?
Thanks in advance
=====================================================================
edit: thanks Jaap,
dt[, x2 := lapply(strsplit(x1, ","), as.integer)] # it works
Still, I wonder whether there is a more elegant solution.
=====================================================================
edit2:
New solutions; base functions are more useful than I thought.
dt[,ABC0:=apply(rbind(.SD), 1, list),.SDcols=A1:B3,by=id]
dt[,ABC1:=apply(cbind(.SD), 1, list),.SDcols=A1:B3,by=id]
or more simply
dt[,ABC2:=lapply(.SD,rbind),.SDcols=A1:B3]
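For what it's worth, a compact alternative is data.table's transpose(), which turns a list of columns into a list of rows and preserves the integer type (a sketch; assumes all of A1:B3 are integer):
# as.list(.SD) gives the columns as a plain list; transpose() flips it into a
# list with one integer vector per row, which := stores as a list column
dt[, x2 := transpose(as.list(.SD)), .SDcols = A1:B3]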

calculate reads per million mapped read using R

df1 <- read.table(text="
gene_id A1 A2 A3 A4 length Total
ENSMUSG00000000028 58 93 48 58 789 200
ENSMUSG00000000031 11 7 20 16 364 54
ENSMUSG00000000037 3 5 6 98 196 112
ENSMUSG00000000058 66 93 69 71 436 299
ENSMUSG00000000085 55 68 97 67 177 287", header=TRUE)
The table represents the read count of each gene in different samples (A1, A2, ..., A4).
How can I calculate the reads per million mapped reads (RPKM) for these raw read counts using R?
RPKM = (number of reads in a gene * 1e6) / (Total * length)
out_put <- read.table(text="
gene_id A1 A2 A3 A4
ENSMUSG00000000028 367.5539 589.3536 304.1825 367.5539
ENSMUSG00000000031 559.6256 356.1254 1017.5010 814.0008
ENSMUSG00000000037 136.6618 227.7697 273.3236 4464.2857
ENSMUSG00000000058 506.2747 713.3871 529.2872 544.6289
ENSMUSG00000000085 1082.6985 1338.6090 1909.4864 1318.9236", header=TRUE)
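Since the formula is purely elementwise, a direct vectorized sketch in base R (no reshaping; the column names shown above are assumed) would be:
cols <- c("A1", "A2", "A3", "A4")
# dividing a data.frame by a vector of length nrow(df1) recycles it down each column
rpkm <- df1[cols] * 1e6 / (df1$Total * df1$length)
output <- cbind(df1["gene_id"], rpkm)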
One way to do this without writing out each column or using a loop is with melt and dcast:
library(reshape2)
m_df1 <- melt(df1, measure.vars=c("A1","A2","A3","A4"))
m_df1$RPKM <- with(m_df1, value*1e6 / (Total*length))
output <- dcast(gene_id~variable,value.var="RPKM",data=m_df1)
> output
gene_id A1 A2 A3 A4
1 ENSMUSG00000000028 367.5539 589.3536 304.1825 367.5539
2 ENSMUSG00000000031 559.6256 356.1254 1017.5010 814.0008
3 ENSMUSG00000000037 136.6618 227.7697 273.3236 4464.2857
4 ENSMUSG00000000058 506.2747 713.3871 529.2872 544.6289
5 ENSMUSG00000000085 1082.6985 1338.6090 1909.4864 1318.9236
A second way is to use sapply to create a matrix of estimates, which you can then either rename and add to your original data, or cbind to your gene_ids.
my_cols <- c("A1", "A2", "A3", "A4")
RPKMs <- sapply(my_cols, function(x) {
  df1[, x] * 1e6 / (df1$Total * df1$length)
})
# use data.frame() rather than cbind() so gene_id is not coerced to its factor codes
output <- data.frame(gene_id = df1$gene_id, RPKMs)
You can also achieve this without reshaping, using the data.table package:
library(data.table)
setDT(df1)[, indx := .I][, lapply(.SD, function(x) (x * 1e6) / (Total * length)),
                         by = .(indx, gene_id, length, Total)]
this gives:
indx gene_id length Total A1 A2 A3 A4
1: 1 ENSMUSG00000000028 789 200 367.5539 589.3536 304.1825 367.5539
2: 2 ENSMUSG00000000031 364 54 559.6256 356.1254 1017.5010 814.0008
3: 3 ENSMUSG00000000037 196 112 136.6618 227.7697 273.3236 4464.2857
4: 4 ENSMUSG00000000058 436 299 506.2747 713.3871 529.2872 544.6289
5: 5 ENSMUSG00000000085 177 287 1082.6985 1338.6090 1909.4864 1318.9236
Explanation:
with setDT(df1) you convert the data.frame to a data.table
with [, indx := .I] you create a unique identifier for each row
with by = .(indx, gene_id, length, Total) you specify the columns to group by (these columns will not be transformed); by including indx you make sure each row is its own group
with lapply(.SD, function(x) (x * 1e6) / (Total * length)) you apply the required calculation to each column not named in the by statement
A similar solution with dplyr:
library(dplyr)
func <- function(x, y, z) (x * 1e6) / (y * z)
df1 %>% mutate(indx = seq(1, nrow(.))) %>%
  group_by(indx, gene_id, length, Total) %>%
  summarise_each(funs(func(., Total, length)))
which gives:
indx gene_id length Total A1 A2 A3 A4
(int) (fctr) (int) (int) (dbl) (dbl) (dbl) (dbl)
1 1 ENSMUSG00000000028 789 200 367.5539 589.3536 304.1825 367.5539
2 2 ENSMUSG00000000031 364 54 559.6256 356.1254 1017.5010 814.0008
3 3 ENSMUSG00000000037 196 112 136.6618 227.7697 273.3236 4464.2857
4 4 ENSMUSG00000000058 436 299 506.2747 713.3871 529.2872 544.6289
5 5 ENSMUSG00000000085 177 287 1082.6985 1338.6090 1909.4864 1318.9236
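Note that summarise_each() and funs() have since been deprecated; with dplyr >= 1.0.0 the same calculation can be sketched more directly with across() (mutate() keeps the other columns as-is):
library(dplyr)
# apply the RPKM formula to each count column in place
df1 %>% mutate(across(A1:A4, ~ .x * 1e6 / (Total * length)))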

Count number of occurrences of a string in R under different conditions

I have a dataframe with multiple columns, called "data", which looks like this:
Preferences Status Gender
8a 8b 9a Employed Female
10b 11c 9b Unemployed Male
11a 11c 8e Student Female
That is, each customer selected 3 preferences and specified other information such as Status and Gender. Each preference is given by a [number][letter] combination, and there are c. 30 possible preferences. The possible preferences are:
8[a - c]
9[a - k]
10[a - d]
11[a - c]
12[a - i]
I want to count the number of occurrences of each preference, under certain conditions for the other columns - eg. for all women.
The output will ideally be a dataframe that looks like this:
Preference Female Male Employed Unemployed Student
8a 1034 934 234 495 203
8b 539 239 609 394 235
8c 124 395 684 94 283
9a 120 999 895 945 345
9b 978 385 596 923 986
etc.
What's the most efficient way to achieve this?
Thanks.
I am assuming you are starting with something that looks like this:
mydf <- structure(list(
  Preferences = c("8a 8b 9a", "10b 11c 9b", "11a 11c 8e"),
  Status = c("Employed", "Unemployed", "Student"),
  Gender = c("Female", "Male", "Female")),
  .Names = c("Preferences", "Status", "Gender"),
  class = c("data.frame"), row.names = c(NA, -3L))
mydf
# Preferences Status Gender
# 1 8a 8b 9a Employed Female
# 2 10b 11c 9b Unemployed Male
# 3 11a 11c 8e Student Female
If that's the case, you need to "split" the "Preferences" column (by spaces), transform the data into a "long" form, and then reshape it to a wide form, tabulating while you do so.
With the right tools, this is pretty straightforward.
library(devtools)
library(data.table)
library(reshape2)
source_gist(11380733) # for `cSplit`
dcast.data.table(                               # Step 3--aggregate to wide form
  melt(                                         # Step 2--convert to long form
    cSplit(mydf, "Preferences", " ", "long"),   # Step 1--split "Preferences"
    id.vars = "Preferences"),
  Preferences ~ value, fun.aggregate = length)
# Preferences Employed Female Male Student Unemployed
# 1: 10b 0 0 1 0 1
# 2: 11a 0 1 0 1 0
# 3: 11c 0 1 1 1 1
# 4: 8a 1 1 0 0 0
# 5: 8b 1 1 0 0 0
# 6: 8e 0 1 0 1 0
# 7: 9a 1 1 0 0 0
# 8: 9b 0 0 1 0 1
I also tried a dplyr + tidyr approach, which looks like the following:
library(dplyr)
library(tidyr)
mydf %>%
  separate(Preferences, c("P_1", "P_2", "P_3")) %>% # split "Preferences"
  gather(Pref, Pvals, P_1:P_3) %>%     # stack the preference columns
  gather(Var, Val, Status:Gender) %>%  # stack the status/gender columns
  group_by(Pvals, Val) %>%             # group by these new columns
  summarise(count = n()) %>%           # aggregate the numbers of each
  spread(Val, count)                   # spread the values out
# Source: local data table [8 x 6]
# Groups:
#
# Pvals Employed Female Male Student Unemployed
# 1 10b NA NA 1 NA 1
# 2 11a NA 1 NA 1 NA
# 3 11c NA 1 1 1 1
# 4 8a 1 1 NA NA NA
# 5 8b 1 1 NA NA NA
# 6 8e NA 1 NA 1 NA
# 7 9a 1 1 NA NA NA
# 8 9b NA NA 1 NA 1
Both approaches are actually pretty quick. Test it with some better sample data than what you shared, like this:
preferences <- c(paste0(8, letters[1:3]),
                 paste0(9, letters[1:11]),
                 paste0(10, letters[1:4]),
                 paste0(11, letters[1:3]),
                 paste0(12, letters[1:9]))
set.seed(1)
nrow <- 10000
mydf <- data.frame(
  Preferences = vapply(replicate(nrow,
                                 sample(preferences, 3, FALSE),
                                 FALSE),
                       function(x) paste(x, collapse = " "),
                       character(1L)),
  Status = sample(c("Employed", "Unemployed", "Student"), nrow, TRUE),
  Gender = sample(c("Male", "Female"), nrow, TRUE)
)
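For completeness, a dependency-free base R sketch of the same split/stack/tabulate idea (assuming the preferences are always space-separated):
prefs <- strsplit(as.character(mydf$Preferences), " ", fixed = TRUE)
long <- data.frame(
  Preference = unlist(prefs),
  Status = rep(mydf$Status, lengths(prefs)),
  Gender = rep(mydf$Gender, lengths(prefs))
)
# tabulate preferences by Status and by Gender, then bind the two tables side by side
cbind(table(long$Preference, long$Status),
      table(long$Preference, long$Gender))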

R - "find and replace" integers in a column with character labels

I have two data frames the first (DF1) is similar to this:
Ba Ram You Sheep
30 1 33.2 120.9
27 3 22.1 121.2
22 4 39.1 99.1
11 1 20.0 101.6
9 3 9.8 784.3
The second (DF2) contains titles for column "Ram":
V1 V2
1 RED
2 GRN
3 YLW
4 BLU
I need to replace the DF1$Ram with corresponding character strings of DF2$V2:
Ba Ram You Sheep
30 RED 33.2 120.9
27 YLW 22.1 121.2
22 BLU 39.1 99.1
11 RED 20.0 101.6
9 YLW 9.8 784.3
I can do this with a nested for loop, but it feels REALLY inefficient:
x <- c(1:nrow(DF1))
y <- c(1:4)
for (i in x) {
  for (j in y) {
    if (DF1$Ram[i] == j) {
      DF1$Ram[i] <- DF2$V2[j]
    }
  }
}
Is there a way to do this more efficiently??!?! I know there is. I'm a noob.
Use merge
> result <- merge(df1, df2, by.x="Ram", by.y="V1")[,-1] # merging data.frames
> colnames(result)[4] <- "Ram" # setting name
The following is just for getting the output in the order you showed us
> result[order(result$Ba, decreasing = TRUE), c("Ba", "Ram", "You", "Sheep")]
Ba Ram You Sheep
1 30 RED 33.2 120.9
3 27 YLW 22.1 121.2
5 22 BLU 39.1 99.1
2 11 RED 20.0 101.6
4 9 YLW 9.8 784.3
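A one-line alternative is a lookup with match(), which returns the position of each Ram value in DF2$V1 (a sketch; it assumes every Ram value occurs in DF2$V1):
# index DF2$V2 by the matched positions; as.character() guards against V2 being a factor
DF1$Ram <- as.character(DF2$V2)[match(DF1$Ram, DF2$V1)]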
Usually, when you encode character strings as integers, you likely want a factor. Factors offer some benefits you can read about in the fine manual.
df1 <- data.frame(V2 = c(3, 3, 2, 3, 1))
df2 <- data.frame(V1 = 1:4, V2 = c('a', 'b', 'c', 'd'))
df1 <- within(df1, {
  f <- factor(V2, levels = df2$V1, labels = df2$V2)
  aschar <- as.character(f) # the labels as plain strings
  asnum <- as.numeric(f)    # the underlying integer level codes
})
