Compare rows by different columns in a dataframe - r

I have a dataframe looking like this:
id value1 value2 value3 value4
A 14 24 22 9
B 51 25 29 33
C 4 16 8 10
D 1 4 2 4
Now I want to compare each column of the row with the others rows in order to identify the rows where every value is higher.
So, for example for id D this would be A, B and C.
For C it would be B, for A it's B and for B there is no row.
I tried to do that by looping through the rows and comparing every column, but that takes a lot of time. The original dataset has about 5000 rows and 20 columns to compare.
I am sure that there is a way to do that more efficiently. Thanks for your help!

I don't know a simple function to do this task.
Here is how I would do it.
library(dplyr)
DF <- data.frame(
id = c("A", "B", "C", "D"),
value1 = c(14, 51, 4, 1),
value2 = c(24, 25, 16, 4),
value3 = c(22, 29, 8, 2),
value4 = c(9, 33, 10, 4),
stringsAsFactors = FALSE)
# For every value column, list the ids sorted by that column (ascending).
# The result is a list with one character vector of ids per value column.
tmp <- lapply(select(DF, -id), function(x) DF$id[order(x)])
# For every column, enumerate all (id, bigger) pairs in which `bigger` comes
# after `id` in that column's ordering:
#   - `id`:     the i-th ranked id, repeated once for each id ranked after it
#               (n - 1, n - 2, ..., 0 times);
#   - `bigger`: for each position i, the ids at positions i + 1 .. n.
# NOTE(review): order() still ranks tied values, so two rows that are equal in
# a column are paired as if one were bigger -- confirm whether ties matter.
tmp <- lapply(tmp, function(x) data.frame(
id = rep(x, rev(seq_along(x))-1),
bigger = x[lapply(seq_along(x), function(i)
which(seq_along(x) > i)) %>% unlist()],
stringsAsFactors = FALSE))
# Fold the per-column pair tables together with successive inner joins; a pair
# survives only if it is present in every column's table.
out <- Reduce(
  function(acc, pairs) inner_join(acc, pairs, by = c("id", "bigger")),
  tmp
)
This gets you:
out
# id bigger
#1 D C
#2 D A
#3 D B
#4 C B
#5 A B

Here's an approach that returns results in a data frame format.
library(tidyr)
library(dplyr)
# Reshape to long format: one row per (id, key) pair, with the original column
# name in `key` and the cell in `value`. pivot_longer() replaces the superseded
# gather().
td <- d %>%
  pivot_longer(value1:value4, names_to = "key", values_to = "value")
# Copy with different column names so the self-join below does not collide.
td2 <- td %>% select(id2 = id, key, value2 = value)
# Full outer join on `key`: one row per ordered pair of ids for every key.
dd <- merge(td, td2, by = "key", all = TRUE)
# Keep the pairs where id is strictly smaller than id2 in every column.
# `.groups = "drop"` returns an ungrouped result.
dd %>%
  filter(id != id2) %>%
  group_by(id, id2) %>%
  summarise(all_less = all(value < value2), .groups = "drop") %>%
  filter(all_less)
results (id is less than id2)
id id2 all_less
(fctr) (fctr) (lgl)
1 A B TRUE
2 C B TRUE
3 D A TRUE
4 D B TRUE
5 D C TRUE
data
d <- structure(list(
id = structure(1:4, .Label = c("A", "B", "C", "D"), class = "factor"),
value1 = c(14L, 51L, 4L, 1L),
value2 = c(24L, 25L, 16L, 4L),
value3 = c(22L, 29L, 8L, 2L), value4 = c(9L, 33L, 10L, 4L)
),
.Names = c("id", "value1", "value2", "value3", "value4"),
class = "data.frame", row.names = c(NA, -4L)
)

I think this works just fine:
# Position of the id column, so it can be dropped from the comparisons.
ind <- which(names(df) == "id")
# For each row x of the value columns:
#   t(df[,-ind])    puts every row of df in a column, so `x < ...` recycles x
#                   down each column (a row-versus-row comparison);
#   !t(...)         marks the cells where x is NOT smaller;
#   !rowSums(...)   is TRUE for rows with no such cell, i.e. rows that exceed
#                   x in every column.
# The ids of those rows are returned for each row of df.
apply(df[,-ind],1,function(x) df$id[!rowSums(!t(x < t(df[,-ind])))] )
# [[1]]
# [1] "B"
#
# [[2]]
# character(0)
#
# [[3]]
# [1] "B"
#
# [[4]]
# [1] "A" "B" "C"

Related

Verifying whether at least two columns have the same value in a specific row

I have a data set and I want to check whether all my variables have unique values in a specific row
let's say i want to analyze row D
my data
Name F S T
A 1 2 3
B 2 3 4
C 3 4 5
D 4 5 6
> TRUE (because all the three variables have unique value)
Second example
Name F S T
A 1 2 3
B 2 3 4
C 3 4 5
D 4 5 4
>False (because F and T have the same value in row D )
In base R do
# Check whether the selected row(s) contain no repeated value.
#
# dat: data frame whose first column is a label column (it is dropped).
# ind: row index passed to `[`.
# Returns TRUE when every remaining value is distinct, FALSE otherwise.
f1 <- function(dat, ind) {
  row_values <- unlist(dat[ind, -1])
  # anyDuplicated() gives 0 when nothing repeats.
  anyDuplicated(row_values) == 0L
}
-testing
> f1(df, 4)
[1] TRUE
> f1(df1, 4)
[1] FALSE
data
df <- structure(list(Name = c("A", "B", "C", "D"), F = 1:4, S = 2:5,
T = 3:6), class = "data.frame", row.names = c(NA, -4L))
df1 <- structure(list(Name = c("A", "B", "C", "D"), F = 1:4, S = 2:5,
T = c(3L, 4L, 5L, 4L)), class = "data.frame", row.names = c(NA,
-4L))
You can use dplyr for this:
# Count the distinct values in every column from the second one onwards, then
# test whether each of those counts equals the number of rows of df.
# NOTE(review): this checks uniqueness down each column, not across the
# columns of a single row -- confirm this is the intended test.
df %>%
summarize_at(c(2:ncol(.)), n_distinct) %>%
summarize(if_all(.fns = ~ .x == nrow(df)))

How to fill a dataframe from another one in R?

I want to fill df2 with information from df1.
df1 as below
ID Mutation
1 A
2 B
2 C
3 A
df2 as below
ID A B C
1
2
3
For example, if mutation A is found in ID 1, then I want it in df2 it marked as "Y".
So the df2 result should be
ID A B C
1 Y
2 Y Y
3 Y
I have hundreds of IDs and more than 20 mutations. How can I efficiently achieve this in R? Thanks!
Using data.table you can try
library(data.table)
setDT(df)
# One row per ID, one column per mutation; a cell holds the mutation name when
# the ID carries it and NA otherwise.
# Assumes each (ID, Mutation) pair occurs at most once -- TODO confirm.
df2 <- dcast(df, ID ~ Mutation, value.var = "Mutation")
# Recode every mutation column, whatever it is called, so the code does not
# need editing when there are more mutations than A, B and C.
mutation_cols <- setdiff(names(df2), "ID")
df2[, (mutation_cols) := lapply(.SD, function(x) ifelse(is.na(x), " ", "Y")),
    .SDcols = mutation_cols]
df2
#Output
ID A B C
1: 1 Y
2: 2 Y Y
3: 3 Y
Create a new column with value 'Y' and cast the data in wide format.
library(dplyr)
library(tidyr)
df %>%
mutate(value = 'Y') %>%
pivot_wider(names_from = Mutation, values_from = value, values_fill = '')
# ID A B C
# <int> <chr> <chr> <chr>
#1 1 "Y" "" ""
#2 2 "" "Y" "Y"
#3 3 "Y" "" ""
data
df <- structure(list(ID = c(1L, 2L, 2L, 3L), Mutation = c("A", "B",
"C", "A")), class = "data.frame", row.names = c(NA, -4L))

Calculating sum of certain values across two columns in R

I currently have a dataframe like the one below of a bunch of pairwise correlations:
Data
structure(list(ID1 = c("A", "A", "A", "B", "B", "C"), ID2 = c("B",
"C", "D", "C", "D", "D"), cor = c(0.6, 0.6, 0.2, 0.1, 0.9, 0.2
), value1 = c(50L, 50L, 50L, 20L, 20L, 30L), value2 = c(20L,
30L, 100L, 30L, 100L, 100L)), class = "data.frame", row.names = c(NA,
-6L))
ID1 ID2 cor value1 value2
1 A B 0.6 50 20
2 A C 0.6 50 30
3 A D 0.2 50 100
4 B C 0.1 20 30
5 B D 0.9 20 100
6 C D 0.2 30 100
I'm trying to get the sum of all IDs (i.e. B) of the product between cor and either value1 or value2 depending on whether it is from ID1 or ID2.
For instance, the sum of B would be (cor x value)
(0.6 x 50) + (0.1 x 30) + (0.9 x 100)
I essentially would need to do this for around 20000 unique IDs. I hope this makes sense. I'm not that great in R (yet)!
Does this achieve what you need?
library(tidyverse)
# Stack ID1 and ID2 into one column (`values`), recording in `names` which
# side of the pair each ID came from; cor, value1 and value2 are kept as-is.
df2 <- df %>%
pivot_longer(names_to = "names", values_to = "values", -c(cor:value2)) %>%
# An ID taken from ID1 is weighted by value2, one taken from ID2 by value1.
mutate(value = if_else(names == "ID1", value2, value1),
sum = cor * value) %>%
# Total the weighted correlations per ID.
group_by(values) %>%
summarise(sum = sum(sum))
Unless you're looking for dplyr way of answering it, here's a quick but a little inelegant way of doing it:
# Use logical masks to pick the rows; indexing with the matched ID strings
# would look the values up by name and return NA.
cond1 <- df$ID1 == "B"
# "B" is first in the pair: weight cor by the partner's value (value2).
sum1 <- sum(df$cor[cond1] * df$value2[cond1])
cond2 <- df$ID2 == "B"
# "B" is second in the pair: weight cor by the partner's value (value1).
sum2 <- sum(df$cor[cond2] * df$value1[cond2])
finalsum <- sum1 + sum2
Basically you want to first look at which row B is in ID1, and then do the product-sum, and then look at which row B is in ID2 and do the same.
Update:
What if you have thousands of ID? Again, I like it quick so create a function out of it:
# Correlation-weighted sum for one ID.
#
# df: data frame with columns ID1, ID2, cor, value1 and value2, one row per
#     pair of IDs.
# ID: the ID to total.
# Returns sum(cor * partner value) over every row that contains ID, where the
# partner value is value2 when ID is in ID1 and value1 when ID is in ID2.
# Returns 0 when ID occurs in neither column.
prodsum <- function (df, ID) {
  # which() gives row positions and drops NA comparisons; indexing with the
  # matched ID strings would look values up by name and return NA.
  rows_as_id1 <- which(df$ID1 == ID)
  sum1 <- sum(df$cor[rows_as_id1] * df$value2[rows_as_id1])
  rows_as_id2 <- which(df$ID2 == ID)
  sum2 <- sum(df$cor[rows_as_id2] * df$value1[rows_as_id2])
  sum1 + sum2
}
Then prodsum(df, "B") will give you the answer for original question. And you can use sapply() to do the job of cycling through thousands of IDs:
# Every ID that appears on either side of a pair.
IDs <- unique(c(df$ID1, df$ID2))
# One product-sum per ID, returned as a vector named by ID.
sapply(IDs, function (x) prodsum(df, x))
There may or may not be a problem if an ID exists exclusively in ID1 or ID2. I'm sure you can write a conditional to deal with the problem.
Another way of looking at it is shown below.
Assuming your data frame name is a
# When an ID is the first member of a pair (ID1), cor is weighted by the
# partner's value, which is stored in value2.
a1 <- subset(a, select = c(ID1, cor, value2))
colnames(a1)[colnames(a1) == "value2"] <- "value1"
# When an ID is the second member (ID2), the partner's value is value1.
a2 <- subset(a, select = c(ID2, cor, value1))
colnames(a2)[colnames(a2) == "ID2"] <- "ID1"
# Both pieces now share the columns ID1, cor, value1 and can be stacked.
a3 <- rbind(a1, a2)
a3$MULTIPLY1 <- a3$cor * a3$value1
a4 <- a3 %>% group_by(ID1) %>% summarise(FINALVALUE = sum(MULTIPLY1))
a4
# # A tibble: 4 x 2
#   ID1   FINALVALUE
#   <chr>      <dbl>
# 1 A             50
# 2 B            123
# 3 C             52
# 4 D             34
Hope this will help to some extent...!

How to extract a column based on column name?

I have a data frame df
m n o p
a 1 1 2 5
b 1 2 0 4
c 3 3 3 3
I can extract column m by:
df[,"m"]
Now the problem is, the column name was generated somewhere else (multiple times, in a for loop). For example, column name m was generated by choosing a specific element in the dataframe, gen, in one loop
:
> gen[i,1]
[1] m
How do I extract the column based on gen[i,1]?
Just nest the subsetting.
dat[,"m"]
# [1] 1 1 3
i <- 13
gen[i, 1]
# [1] "m"
dat[, gen[i, 1]]
# [1] 1 1 3
Or, if you don't want the column to be dropped:
dat[, gen[i, 1], drop=FALSE]
# m
# a 1
# b 1
# c 3
Data
dat <- structure(list(m = c(1L, 1L, 3L), n = 1:3, o = c(2L, 0L, 3L),
p = 5:3), class = "data.frame", row.names = c("a", "b", "c"
))
gen <- data.frame(letters)
We can use select from dplyr
library(dplyr)
i <- 13
dat %>%
select(gen[i, 1])
# m
#a 1
#b 1
#c 3
data
dat <- structure(list(m = c(1L, 1L, 3L), n = 1:3, o = c(2L, 0L, 3L),
p = 5:3), class = "data.frame", row.names = c("a", "b", "c"
))
gen <- data.frame(letters)

How to sanitize a df according to specific variable values?

I have two data frames. dfOne is made like this:
X Y Z T J
3 4 5 6 1
1 2 3 4 1
5 1 2 5 1
and dfTwo is made like this
C.1 C.2
X Z
Y T
I want to obtain a new dataframe containing only the rows where the X, Y, Z and T values are simultaneously greater than specific thresholds.
Example. I need simultaneously (in the same row):
X, Y > 2
Z, T > 4
I need to use the second data frame to reach my objective, I expect something like:
dfTwo$C.1>2
so the result would be a new dataframe with this structure:
X Y Z T J
3 4 5 6 1
How could I do it?
Here is a base R method with Map and Reduce.
# build lookup table of thresholds relative to variable name
vals <- setNames(c(2, 2, 4, 4), unlist(dat2))
# subset data.frame
dat[Reduce("&", Map(">", dat[names(vals)], vals)), ]
X Y Z T J
1 3 4 5 6 1
Here, Map returns a list of length 4 with logical variables corresponding to each comparison. This list is passed to Reduce which returns a single logical vector with length corresponding to the number of rows in the data.frame, dat. This logical vector is used to subset dat.
data
dat <-
structure(list(X = c(3L, 1L, 5L), Y = c(4L, 2L, 1L), Z = c(5L,
3L, 2L), T = c(6L, 4L, 5L), J = c(1L, 1L, 1L)), .Names = c("X",
"Y", "Z", "T", "J"), class = "data.frame", row.names = c(NA,
-3L))
dat2 <-
structure(list(C.1 = structure(1:2, .Label = c("X", "Y"), class = "factor"),
C.2 = structure(c(2L, 1L), .Label = c("T", "Z"), class = "factor")), .Names = c("C.1",
"C.2"), class = "data.frame", row.names = c(NA, -2L))
We can use the purrr package
Here is the input data.
# Data frame from lmo's solution
dat <-
structure(list(X = c(3L, 1L, 5L), Y = c(4L, 2L, 1L), Z = c(5L,
3L, 2L), T = c(6L, 4L, 5L), J = c(1L, 1L, 1L)), .Names = c("X",
"Y", "Z", "T", "J"), class = "data.frame", row.names = c(NA,
-3L))
# A numeric vector to show the threshold values
# Notice that columns without any requirements need NA
vals <- c(X = 2, Y = 2, Z = 4, T = 4, J = NA)
Here is the implementation
library(purrr)
map2_dfc(dat, vals, ~ifelse(.x > .y | is.na(.y), .x, NA)) %>% na.omit()
# A tibble: 1 x 5
X Y Z T J
<int> <int> <int> <int> <int>
1 3 4 5 6 1
map2_dfc loop through each column in dat and each value in vals one by one with a defined function. ~ifelse(.x > .y | is.na(.y), .x, NA) means if the number in each column is larger than the corresponding value in vals, or vals is NA, the output should be the original value from the column. Otherwise, the value is replaced to be NA. The output of map2_dfc(dat, vals, ~ifelse(.x > .y | is.na(.y), .x, NA)) is a data frame with NA values in some rows indicating that the condition is not met. Finally, na.omit removes those rows.
Update
Here I demonstrate how to convert the dfTwo dataframe to the vals vector in my example.
First, let's create the dfTwo data frame.
dfTwo <- read.table(text = "C.1 C.2
X Z
Y T",
header = TRUE, stringsAsFactors = FALSE)
dfTwo
C.1 C.2
1 X Z
2 Y T
To complete the task, I load the dplyr and tidyr package.
library(dplyr)
library(tidyr)
Now I begin the transformation of dfTwo. The first step is to use stack function to convert the format.
dfTwo2 <- dfTwo %>%
stack() %>%
setNames(c("Col", "Group")) %>%
mutate(Group = as.character(Group))
dfTwo2
Col Group
1 X C.1
2 Y C.1
3 Z C.2
4 T C.2
The second step is to add the threshold information. One way to do this is to create a look-up table showing the association between Group and Value
threshold_df <- data.frame(Group = c("C.1", "C.2"),
Value = c(2, 4),
stringsAsFactors = FALSE)
threshold_df
Group Value
1 C.1 2
2 C.2 4
And then we can use the left_join function to combine the data frame.
# Attach each group's threshold from the threshold_df lookup table.
dfTwo3 <- dfTwo2 %>% left_join(threshold_df, by = "Group")
dfTwo3
Col Group Value
1 X C.1 2
2 Y C.1 2
3 Z C.2 4
4 T C.2 4
Now it is the third step. Notice that there is a column called J which does not need any threshold. So we need to add this information to dfTwo3. We can use the complete function from tidyr. The following code completes the data frame by adding Col in dat but not in dfTwo3 and NA to the Value.
dfTwo4 <- dfTwo3 %>% complete(Col = colnames(dat))
dfTwo4
# A tibble: 5 x 3
Col Group Value
<chr> <chr> <dbl>
1 J <NA> NA
2 T C.2 4
3 X C.1 2
4 Y C.1 2
5 Z C.2 4
The fourth step is to put the rows of dfTwo4 in the right order. We can achieve this by turning Col into a factor and assigning the levels based on the order of the column names in dat.
dfTwo5 <- dfTwo4 %>%
mutate(Col = factor(Col, levels = colnames(dat))) %>%
arrange(Col) %>%
mutate(Col = as.character(Col))
dfTwo5
# A tibble: 5 x 3
Col Group Value
<chr> <chr> <dbl>
1 X C.1 2
2 Y C.1 2
3 Z C.2 4
4 T C.2 4
5 J <NA> NA
We are almost there. Now we can create vals from dfTwo5.
vals <- dfTwo5$Value
names(vals) <- dfTwo5$Col
vals
X Y Z T J
2 2 4 4 NA
Now we are ready to use the purrr package to filter the data.
The above is a breakdown of the steps. We can combine all these steps into the following code for simplicity.
library(dplyr)
library(tidyr)
threshold_df <- data.frame(Group = c("C.1", "C.2"),
Value = c(2, 4),
stringsAsFactors = FALSE)
dfTwo2 <- dfTwo %>%
stack() %>%
setNames(c("Col", "Group")) %>%
mutate(Group = as.character(Group)) %>%
left_join(threshold_df, by = "Group") %>%
complete(Col = colnames(dat)) %>%
mutate(Col = factor(Col, levels = colnames(dat))) %>%
arrange(Col) %>%
mutate(Col = as.character(Col))
vals <- dfTwo2$Value
names(vals) <- dfTwo2$Col
dfOne[Reduce(intersect, list(which(dfOne["X"] > 2),
which(dfOne["Y"] > 2),
which(dfOne["Z"] > 4),
which(dfOne["T"] > 4))),]
# X Y Z T J
#1 3 4 5 6 1
Or iteratively (so fewer inequalities are tested):
vals = c(X = 2, Y = 2, Z = 4, T = 4) # from #lmo's answer
dfOne[Reduce(intersect, lapply(names(vals), function(x) which(dfOne[x] > vals[x]))),]
# X Y Z T J
#1 3 4 5 6 1
I'm writing this assuming that the second DF is meant to categorize the fields in the first DF. It's way simpler if you don't need to use the second one to define the conditions:
dfNew = dfOne[dfOne$X > 2 & dfOne$Y > 2 & dfOne$Z > 4 & dfOne$T > 4, ]
Or, using dplyr:
library(dplyr)
dfNew = dfOne %>% filter(X > 2 & Y > 2 & Z > 4 & T > 4)
In case that's all you need, I'll save this comment while I poke at the more complicated version of the question.

Resources