I currently have a dataframe like the one below of a bunch of pairwise correlations:
Data
structure(list(ID1 = c("A", "A", "A", "B", "B", "C"), ID2 = c("B",
"C", "D", "C", "D", "D"), cor = c(0.6, 0.6, 0.2, 0.1, 0.9, 0.2
), value1 = c(50L, 50L, 50L, 20L, 20L, 30L), value2 = c(20L,
30L, 100L, 30L, 100L, 100L)), class = "data.frame", row.names = c(NA,
-6L))
ID1 ID2 cor value1 value2
1 A B 0.6 50 20
2 A C 0.6 50 30
3 A D 0.2 50 100
4 B C 0.1 20 30
5 B D 0.9 20 100
6 C D 0.2 30 100
For each ID (e.g. B), I'm trying to get the sum of the products of cor and either value1 or value2, depending on whether the ID comes from ID1 or ID2.
For instance, the sum for B would be (cor x value):
(0.6 x 50) + (0.1 x 30) + (0.9 x 100) = 123
I essentially would need to do this for around 20000 unique IDs. I hope this makes sense. I'm not that great in R (yet)!
Does this achieve what you need?
library(tidyverse)
df2 <- df %>%
  pivot_longer(-c(cor:value2), names_to = "names", values_to = "values") %>%
  mutate(value = if_else(names == "ID1", value2, value1),
         sum = cor * value) %>%
  group_by(values) %>%
  summarise(sum = sum(sum))
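For the example data this should give one sum per ID (output sketched by hand from the rule above; note that B matches your 123):
df2
# A tibble: 4 x 2
#   values   sum
#   <chr>  <dbl>
# 1 A         50
# 2 B        123
# 3 C         52
# 4 D         34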
Unless you're looking for a dplyr way of answering it, here's a quick but slightly inelegant base R way of doing it:
cond1 <- df$ID1 == "B"
sum1 <- sum(df$cor[cond1] * df$value2[cond1])
cond2 <- df$ID2 == "B"
sum2 <- sum(df$cor[cond2] * df$value1[cond2])
finalsum <- sum1 + sum2
Basically you first find the rows where B is in ID1 and take the product-sum with value2 (per your worked example, an ID in ID1 pairs with value2), then find the rows where B is in ID2 and do the same with value1.
Update:
What if you have thousands of IDs? Again, I like it quick, so turn it into a function:
prodsum <- function(df, ID) {
  cond1 <- df$ID1 == ID
  sum1 <- sum(df$cor[cond1] * df$value2[cond1])
  cond2 <- df$ID2 == ID
  sum2 <- sum(df$cor[cond2] * df$value1[cond2])
  return(sum1 + sum2)
}
Then prodsum(df, "B") will give you the answer to the original question. And you can use sapply() to cycle through thousands of IDs:
IDs <- unique(c(df$ID1, df$ID2))
sapply(IDs, function(x) prodsum(df, x))
There may or may not be a problem if an ID exists exclusively in ID1 or ID2. I'm sure you can write a conditional to deal with the problem.
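In fact, since sum() of an empty numeric vector is 0 in R, prodsum() already returns a sensible total when an ID appears in only one of the two columns. A guard is only needed if you'd rather get NA for IDs that appear in neither column (a sketch; prodsum_safe is just an illustrative name):
prodsum_safe <- function(df, ID) {
  if (!ID %in% c(df$ID1, df$ID2)) return(NA_real_)  # ID absent from both columns
  prodsum(df, ID)
}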
Another way of looking at it is below: stack the ID1 rows (paired with value2, as in your worked example) and the ID2 rows (paired with value1), then group and sum.
Assuming your data frame is named a:
a1 <- subset(a, select = c(ID1, cor, value2))
a2 <- subset(a, select = c(ID2, cor, value1))
colnames(a1) <- colnames(a2) <- c("ID", "cor", "value")
a3 <- rbind(a1, a2)
a3$MULTIPLY1 <- a3$cor * a3$value
a4 <- a3 %>% group_by(ID) %>% summarise(FINALVALUE = sum(MULTIPLY1))
# A tibble: 4 x 2
  ID    FINALVALUE
  <chr>      <dbl>
1 A             50
2 B            123
3 C             52
4 D             34
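The same stack-then-aggregate idea also works in base R alone (a sketch under the same pairing rule, no dplyr needed):
stacked <- rbind(
  data.frame(ID = a$ID1, prod = a$cor * a$value2),
  data.frame(ID = a$ID2, prod = a$cor * a$value1)
)
aggregate(prod ~ ID, data = stacked, FUN = sum)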
Hope this will help to some extent...!
Say I have a df like so:
T1 <- c("a","b","c","d","e")
T2 <- c("f","g","h","i","j")
score1 <- c(NA,0.01,0.5,0.78,NA)
score2 <- c(1, 2, 3, NA, 6)
df <- data.frame(T1, T2, score1, score2)
df
T1 T2 score1 score2
1 a f NA 1
2 b g 0.01 2
3 c h 0.50 3
4 d i 0.78 NA
5 e j NA 6
If I want to randomly create new T1-T2 pairs, how can I check whether these new pairs are in the df, but only where the score1 column is not NA?
In other words, I randomly sample, say, 2 values from T1 and T2:
(l1 <- sample(df$T1, 2))
(l2 <- sample(df$T2, 2))
and get:
> l1
[1] "c" "d"
> l2
[1] "h" "g"
How would one go about getting the score2 of the c-h and d-g pairs from df, but only if score1 is not NA?
My first instinct would be to create a new df2 without NAs in the score1 column:
df2 <- df[which(!is.na(df$score1)), ]
Then I can create a new df for the new pairs:
df3 <- data.frame(X1 = l1)
df3$X2 <- l2
df3$X3 <- l2
df3$X4 <- l1
#stack X3 with X1 and X4 with X2 (considering that T1-T2 pair is the same as T2-T1 pair)
df4 <- data.frame(T1 = c(df3[,"X1"], df3[,"X3"]),
T2 = c(df3[,"X2"], df3[,"X4"]))
> df4
T1 T2
1 c h
2 d g
3 h c
4 g d
But I'm missing the last step: how to check whether the paired columns from df4 match the paired columns in df2. In the end, I want to get something like:
df
T1 T2 score1 score2
1 c h 0.50 3
2 d g NA NA
I think a merge/join operation makes sense here:
res <- merge(df, data.frame(T1=l1, T2=l2, found=TRUE), by = c("T1","T2"), all = TRUE)
subset(res, found, select = -found)
# T1 T2 score1 score2
# 3 c h 0.5 3
# 4 d g NA NA
Data
df <- structure(list(T1 = c("a", "b", "c", "d", "e"), T2 = c("f", "g", "h", "i", "j"), score1 = c(NA, 0.01, 0.5, 0.78, NA), score2 = c(1, 2, 3, NA, 6)), class = "data.frame", row.names = c(NA, -5L))
l1 <- c("c", "d"); l2 <- c("h", "g")
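If you also want a reversed pair to count as a match (the question treats T1-T2 and T2-T1 as the same pair), one sketch is to sort each pair alphabetically before joining, so that (h, c) and (c, h) normalize to the same key (norm is a hypothetical helper):
# put the alphabetically smaller element of each pair first, so direction no longer matters
norm <- function(a, b) data.frame(P1 = pmin(a, b), P2 = pmax(a, b))
df_n  <- cbind(norm(df$T1, df$T2), df[c("score1", "score2")])
new_n <- cbind(norm(l1, l2), found = TRUE)
res <- merge(df_n, new_n, by = c("P1", "P2"), all.y = TRUE)
subset(res, select = -found)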
Something like this?
set.seed(2022)
(l1 <- sample(df$T1, 2))
#> [1] "d" "c"
(l2 <- sample(df$T2, 2))
#> [1] "h" "i"
mapply(\(x1, x2, data){
  i <- match(x1, data$T1)
  j <- match(x2, data$T2)
  if(any(is.na(c(data$score1[i], data$score1[j])))) {
    NA_real_
  } else {
    sum(c(data$score2[i], -1*data$score2[j]), na.rm = TRUE)
  }
}, l1, l2, MoreArgs = list(data = df))
#> d c
#> -3 3
Created on 2022-01-30 by the reprex package (v2.0.1)
I would like to ask how I can rearrange my dataset to achieve the following.
Original :
Group Value_y Value_z
1 m a
1 n a
2 o b
2 p b
Intended:
Group Value_a Value_b
1 m n
2 o p
This involves separating Value_y according to Value_z and adding a new column according to the group number. I will potentially also need to average a separate column's values and add them as a new column in the same way.
Thank you!
In data.table we can use dcast :
library(data.table)
dcast(setDT(df), Group~rowid(Value_z), value.var = 'Value_y')
# Group 1 2
#1: 1 m n
#2: 2 o p
data
df <- structure(list(Group = c(1L, 1L, 2L, 2L), Value_y = c("m", "n",
"o", "p"), Value_z = c("a", "a", "b", "b")), class = "data.frame",
row.names = c(NA, -4L))
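For comparison, a tidyr sketch of the same reshape (assuming tidyr >= 1.0.0 for pivot_wider; row_number() within each Value_z group plays the role of rowid() above):
library(dplyr)
library(tidyr)
df %>%
  group_by(Value_z) %>%
  mutate(col = paste0("Value_", row_number())) %>%
  ungroup() %>%
  pivot_wider(id_cols = Group, names_from = col, values_from = Value_y)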
There is a dplyr solution. Define
Uneven = seq(1, dim(A)[1] - 1, by = 2)
Even = seq(2, dim(A)[1], by = 2)
with
A = data.frame(Group = c(1, 1, 2, 2), Value_y = c("m", "n", "o", "p"))
Then, you can use the pipe and some dplyr functionality to get
A2 = A %>%
  dplyr::group_by(Group) %>%
  dplyr::mutate(Row_1 = Value_y[Uneven]) %>%
  dplyr::mutate(Row_2 = Value_y[Even]) %>%
  dplyr::select(-Value_y) %>%
  dplyr::slice(1)
and the output
> A2
# A tibble: 2 x 3
# Groups: Group [2]
Group Row_1 Row_2
<dbl> <fct> <fct>
1 1 m n
2 2 o p
Note that this solution presupposes two-pairs of Groups, i.e. an even number of observations.
I have a DT with multiple columns and I need to give a condition in ifelse and do the calculations accordingly. I want it to do count/sum(count) grouped by segment. Here is the DT
Segment Count Flag
A 23 Y
B 45 N
A 56 N
B 212 Y
I want the fourth column to be the count per total count of the segment, based on the flag, so the output should look something like this. For flag N, it is the share of the count within the segment. For flag Y, it is the revenue-percentage calculation if the No (N) became a Yes (Y), i.e. the revenue that could then be earned. I am sorry it is clumsy; please ask me in the comments if you have any doubts.
Segment Count Flag Rev Value
A 23 Y 34 ((56/23)*34)/(34+69)
B 45 N 48 45/(45+212)
A 56 N 23 56/(56+23)
B 212 Y 67 ((45/212)*67)/(67+12)
A 65 Y 69 ...
B 10 Y 12 ...
Any help is appreciated. Thanks!
We can do this with data.table. Convert the 'data.frame' to 'data.table' (setDT(DT)); grouped by 'Segment', create the 'Value' column by dividing 'Count' by the sum of 'Count'; then we update 'Value' where 'Flag' is 'N'.
library(data.table)
setDT(DT)[, Value := Count/sum(Count), Segment
][Flag == "N", Value := Count/sum(Count), Segment]
DT
# Segment Count Flag Value
#1: A 23 Y 0.18852459
#2: B 45 N 1.00000000
#3: A 56 N 1.00000000
#4: B 212 Y 0.78810409
#5: A 43 Y 0.35245902
#6: B 12 Y 0.04460967
Just checking against the OP's expected output 'Value':
> 23/122
#[1] 0.1885246
> 212/269
#[1] 0.7881041
> 43/122
#[1] 0.352459
> 12/269
#[1] 0.04460967
Update 3
Based on update no. 3 in the OP's post:
s1 <- setDT(DT1)[, .(rn = .I[Flag == "Y"], Value = (Rev[Flag=="Y"] *
(Count[Flag == "N"]/Count[Flag=="Y"]))/sum(Rev[Flag == "Y"])), Segment]
s2 <- DT1[, .(rn = .I[Flag == "N"], Value = Count[Flag == "N"]/(Count[Flag == "N"] +
Count[Flag=="Y"][1])), Segment]
DT1[, Value := rbind(s1, s2)[order(rn)]$Value]
DT1
# Segment Count Flag Rev Value
#1: A 23 Y 34 0.8037146
#2: B 45 N 48 0.1750973
#3: A 56 N 23 0.7088608
#4: B 212 Y 67 0.1800215
#5: A 65 Y 69 0.5771471
#6: B 10 Y 12 0.6835443
> ((56/23)*34)/(34+69)
#[1] 0.8037146
> 45/(45+212)
#[1] 0.1750973
> 56/(56+23)
#[1] 0.7088608
> ((45/212)*67)/(67+12)
#[1] 0.1800215
data
DT <- structure(list(Segment = c("A", "B", "A", "B", "A", "B"), Count = c(23L,
45L, 56L, 212L, 43L, 12L), Flag = c("Y", "N", "N", "Y", "Y",
"Y")), .Names = c("Segment", "Count", "Flag"), row.names = c(NA,
-6L), class = "data.frame")
DT1 <- structure(list(Segment = c("A", "B", "A", "B", "A", "B"), Count = c(23L,
45L, 56L, 212L, 65L, 10L), Flag = c("Y", "N", "N", "Y", "Y",
"Y"), Rev = c(34L, 48L, 23L, 67L, 69L, 12L)), .Names = c("Segment",
"Count", "Flag", "Rev"), class = "data.frame", row.names = c(NA,
-6L))
Alternatively, we could also use the dplyr package for that.
Updated based on the suggestions provided by @Aramis7d - thanks!
library(data.table)
df <- fread("Segment Count Flag
A 23 Y
B 45 N
A 56 N
B 212 Y
A 43 Y
B 12 Y")
library(dplyr)
df %>%
  group_by(Segment) %>%
  mutate(Value = Count/sum(Count)) %>%
  group_by(Segment, Flag) %>%
  mutate(Value = if_else(Flag == "N", Count/sum(Count), Value))
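On the fread data above, this should reproduce the data.table result (a hand-written sketch of the values, not the exact tibble print):
# Segment Count Flag      Value
# A          23 Y    0.18852459
# B          45 N    1.00000000
# A          56 N    1.00000000
# B         212 Y    0.78810409
# A          43 Y    0.35245902
# B          12 Y    0.04460967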
For example, suppose that you had a function that applied some dplyr functions, but you couldn't expect datasets passed to this function to have the same column names.
For a simplified example of what I mean, say you had a data frame, arizona.trees:
arizona.trees
group arizona.redwoods arizona.oaks
A 23 11
A 24 12
B 9 8
B 10 7
C 88 22
and another very similar data frame, california.trees:
california.trees
group california.redwoods california.oaks
A 25 50
A 11 33
B 90 5
B 77 3
C 90 35
And you wanted to implement a function that returns the mean for the given groups (A, B, ..., Z) for a given type of tree, and that works for both of these data frames.
foo <- function(dataset, group1, group2, tree.type) {
  column.name <- colnames(dataset[2])
  result <- filter(dataset, group %in% c(group1, group2)) %>%
    select(group, contains(tree.type)) %>%
    group_by(group) %>%
    summarize("mean" = mean(column.name))
  return(result)
}
A desired output for a call of foo(california.trees, A, B, redwoods) would be:
result
mean
A 18
B 83.5
For some reason, an implementation like foo() just doesn't work. This is likely due to some error with the data frame indexing - the function seems to think I am attempting to take the mean of the column.name string, rather than retrieving the column and passing it to mean(). I'm not sure how to avoid this. The pipe operator's implicit passing of the modified data frame, which can't be referenced directly, may also be part of the issue.
Why is this? Is there some alternative implementation that would work?
We can use the quosure-based solution from the devel version of dplyr (soon to be released as 0.6.0):
foo <- function(dataset, group1, group2, tree.type){
  group1 <- quo_name(enquo(group1))
  group2 <- quo_name(enquo(group2))
  colN <- rlang::parse_quosure(names(dataset)[2])
  tree.type <- quo_name(enquo(tree.type))
  dataset %>%
    filter(group %in% c(group1, group2)) %>%
    select(group, contains(tree.type)) %>%
    group_by(group) %>%
    summarise(mean = mean(UQ(colN)))
}
foo(california.trees, A, B, redwoods)
# A tibble: 2 × 2
# group mean
# <chr> <dbl>
#1 A 18.0
#2 B 83.5
foo(arizona.trees, A, B, redwoods)
# A tibble: 2 × 2
# group mean
# <chr> <dbl>
#1 A 23.5
#2 B 9.5
The enquo takes the input arguments and converts them to quosures; with quo_name, they are converted to strings for use with %in%. The second column name is converted from a string to a quosure using parse_quosure, and then unquoted (UQ or !!) for evaluation within summarise.
NOTE: This is based on the OP's function, which selects the second column.
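As a side note, parse_quosure() has since been deprecated in rlang; on current versions the same unquoting can be done with rlang::sym() (a sketch of the one changed line, everything else in foo staying the same):
colN <- rlang::sym(names(dataset)[2])
# ... and inside the pipeline, unquote as before:
summarise(mean = mean(!!colN))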
The above solution selected the column by position (as per the OP's code), so it may not work for other columns. Instead, we can match 'tree.type' and take the 'mean' of the matching column:
foo1 <- function(dataset, group1, group2, tree.type){
  group1 <- quo_name(enquo(group1))
  group2 <- quo_name(enquo(group2))
  tree.type <- quo_name(enquo(tree.type))
  dataset %>%
    filter(group %in% c(group1, group2)) %>%
    select(group, contains(tree.type)) %>%
    group_by(group) %>%
    summarise_at(vars(contains(tree.type)), funs(mean = mean(.)))
}
The function can be tested for different columns in the two datasets
foo1(arizona.trees, A, B, oaks)
# A tibble: 2 × 2
# group mean
# <chr> <dbl>
#1 A 11.5
#2 B 7.5
foo1(arizona.trees, A, B, redwood)
# A tibble: 2 × 2
# group mean
# <chr> <dbl>
#1 A 23.5
#2 B 9.5
foo1(california.trees, A, B, redwood)
# A tibble: 2 × 2
# group mean
# <chr> <dbl>
#1 A 18.0
#2 B 83.5
foo1(california.trees, A, B, oaks)
# A tibble: 2 × 2
# group mean
# <chr> <dbl>
#1 A 41.5
#2 B 4.0
data
arizona.trees <- structure(list(group = c("A", "A", "B", "B", "C"),
arizona.redwoods = c(23L,
24L, 9L, 10L, 88L), arizona.oaks = c(11L, 12L, 8L, 7L, 22L)),
.Names = c("group",
"arizona.redwoods", "arizona.oaks"), class = "data.frame",
row.names = c(NA, -5L))
california.trees <- structure(list(group = c("A", "A", "B", "B", "C"),
california.redwoods = c(25L,
11L, 90L, 77L, 90L), california.oaks = c(50L, 33L, 5L, 3L, 35L
)), .Names = c("group", "california.redwoods", "california.oaks"
), class = "data.frame", row.names = c(NA, -5L))
I have a dataframe looking like this:
id value1 value2 value3 value4
A 14 24 22 9
B 51 25 29 33
C 4 16 8 10
D 1 4 2 4
Now I want to compare each row with the other rows in order to identify the rows where every value is higher.
So, for example for id D this would be A, B and C.
For C it would be B, for A it's B and for B there is no row.
I tried to do that by looping through the rows and comparing every column, but that takes a lot of time. The original dataset has about 5000 rows and 20 columns to compare.
I am sure that there is a way to do that more efficiently. Thanks for your help!
I don't know a simple function for this task. Here is how I would do it.
library(dplyr)
DF <- data.frame(
id = c("A", "B", "C", "D"),
value1 = c(14, 51, 4, 1),
value2 = c(24, 25, 16, 4),
value3 = c(22, 29, 8, 2),
value4 = c(9, 33, 10, 4),
stringsAsFactors = FALSE)
# get the order for each value
tmp <- lapply(select(DF, -id), function(x) DF$id[order(x)])
# find a set of "biggers" for each id
tmp <- lapply(tmp, function(x) data.frame(
id = rep(x, rev(seq_along(x))-1),
bigger = x[lapply(seq_along(x), function(i)
which(seq_along(x) > i)) %>% unlist()],
stringsAsFactors = FALSE))
# inner_join all, this keeps "biggers" in all columns
out <- NULL
for (v in tmp) {
  if (is.null(out)) {
    out <- v
  } else {
    out <- inner_join(out, v, by = c("id", "bigger"))
  }
}
This gets you:
out
# id bigger
#1 D C
#2 D A
#3 D B
#4 C B
#5 A B
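As a side note, the inner_join loop above can be condensed with Reduce (a sketch equivalent to the loop):
out <- Reduce(function(x, y) inner_join(x, y, by = c("id", "bigger")), tmp)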
Here's an approach that returns results in a data frame format.
library(tidyr)
library(dplyr)
# reshape data to long format
td <- d %>% gather(key, value, value1:value4)
# create a copy w/ different names for merging
td2 <- td %>% select(id2 = id, key, value2 = value)
# full outer join to produce one row per pair of IDs
dd <- merge(td, td2, by = "key", all = TRUE)
# the result
dd %>%
  filter(id != id2) %>%
  group_by(id, id2) %>%
  summarise(all_less = !any(value >= value2)) %>%
  filter(all_less)
results (id is less than id2)
id id2 all_less
(fctr) (fctr) (lgl)
1 A B TRUE
2 C B TRUE
3 D A TRUE
4 D B TRUE
5 D C TRUE
data
d <- structure(list(
id = structure(1:4, .Label = c("A", "B", "C", "D"), class = "factor"),
value1 = c(14L, 51L, 4L, 1L),
value2 = c(24L, 25L, 16L, 4L),
value3 = c(22L, 29L, 8L, 2L), value4 = c(9L, 33L, 10L, 4L)
),
.Names = c("id", "value1", "value2", "value3", "value4"),
class = "data.frame", row.names = c(NA, -4L)
)
I think this works just fine:
ind <- which(names(df) == "id")
# for each row x, return the ids of rows whose values are all strictly greater than x's
apply(df[, -ind], 1, function(x) df$id[!rowSums(!t(x < t(df[, -ind])))])
# [[1]]
# [1] "B"
#
# [[2]]
# character(0)
#
# [[3]]
# [1] "B"
#
# [[4]]
# [1] "A" "B" "C"