library(dplyr)
library(tidyr)
df <- data.frame(
First = c("MW3", "MW3", "MW4", "MW5", "MW6", "MW7", "MW7", "MW8"),
Second = c("MW4; MW5; MW6", "MW5; MW3; MW7", "MW8; MW7; MW3",
"MW5; MW6; MW4", "MW3; MW7; MW8", "MW6; MW8; MW4",
"MW3; MW4; MW5", "MW6; MW3; MW7")
)
df <- df %>%
mutate(
ID = row_number(),
lmt = n_distinct(ID)
) %>%
separate_rows(Second, sep = "; ") %>%
group_by(ID) %>%
mutate(
wgt = row_number()
) %>% ungroup()
Let's say that for each ID I want to keep only 1 combination of First and Second (i.e. the length of unique IDs in df should always be equal to lmt).
However, I'd like to do that with optimizing certain parameters. The solution should be designed in such a way that:
Combinations with wgt 1 should be selected whenever possible, alternatively also 2, but 3 should be avoided (i.e. sum of wgt should be minimal);
The difference between the frequency of a value in Second and frequency in First should be close to 0.
Any ideas on how to approach this in R?
Expected output for the above case is:
ID First Second wgt lmt
1 1 MW3 MW4 1 8
2 2 MW3 MW7 3 8
3 3 MW4 MW7 2 8
4 4 MW5 MW5 1 8
5 5 MW6 MW3 1 8
6 6 MW7 MW8 2 8
7 7 MW7 MW3 1 8
8 8 MW8 MW6 1 8
Why? Simply because with this combination, there is not more of any element on the right side (Second) that it is on the left (First). For example, there are two MW3 elements on the right as well as on the left.
However, the price to pay here is that wgt is not always 1 (sum of wgt is not 8 but 12).
Clarification: In case both criteria cannot be minimized at the same time, the minimization of 2nd criteria (difference between frequencies) should be prioritized.
I played around with this problem and I can share a solution using a variation of minconflicts algorithm. The key here is to find a scoring function that combines your requirements. The implementation below follows your recommendation 'let's say the objective should be to prioritize the minimization of 2nd criteria (difference between frequencies)'. Experiment with other scoring functions on your actual data and let's see how far you get.
On your original data (8 IDs) I got solution equally good as the one you posted:
> solution_summary(current_solution)
Name FirstCount SecondCount diff
1: MW3 2 2 0
2: MW4 1 1 0
3: MW5 1 1 0
4: MW6 1 1 0
5: MW7 2 2 0
6: MW8 1 1 0
[1] "Total freq diff: 0"
[1] "Total wgt: 12"
With random data with 10000 IDs the algorithm is able to find solution with no difference in First/Second frequencies (but sum of wgt is bigger than minimum):
> solution_summary(current_solution)
Name FirstCount SecondCount diff
1: MW3 1660 1660 0
2: MW4 1762 1762 0
3: MW5 1599 1599 0
4: MW6 1664 1664 0
5: MW7 1646 1646 0
6: MW8 1669 1669 0
[1] "Total freq diff: 0"
[1] "Total wgt: 19521"
Code below:
library(data.table)
df <- as.data.table(df)
df <- df[, .(ID, First, Second, wgt)]
# PLAY AROUND WITH THIS PARAMETER
freq_weight <- 0.9
wgt_min <- df[, uniqueN(ID)]
wgt_max <- df[, uniqueN(ID) * 3]
freq_min <- 0
freq_max <- df[, uniqueN(ID) * 2] #verify if this is the worst case scenario
score <- function(solution){
# compute raw scores
current_wgt <- solution[, sum(wgt)]
second_freq <- solution[, .(SecondCount = .N), by = Second]
names(second_freq)[1] <- "Name"
compare <- merge(First_freq, second_freq, by = "Name", all = TRUE)
compare[is.na(compare)] <- 0
compare[, diff := abs(FirstCount - SecondCount)]
current_freq <- compare[, sum(diff)]
# normalize
wgt_score <- (current_wgt - wgt_min) / (wgt_max - wgt_min)
freq_score <- (current_freq - freq_min) / (freq_max - freq_min)
#combine
score <- (freq_weight * freq_score) + ((1 - freq_weight) * wgt_score)
return(score)
}
#initialize random solution
current_solution <- df[, .SD[sample(.N, 1)], by = ID]
#get freq of First (this does not change)
First_freq <- current_solution[, .(FirstCount = .N), by = First]
names(First_freq)[1] <- "Name"
#get mincoflict to be applied on each iteration
minconflict <- function(df, solution){
#pick ID
change <- solution[, sample(unique(ID), 1)]
#get permissible values
values <- df[ID == change, .(Second, wgt)]
#assign scores
values[, score := NA_real_]
for (i in 1:nrow(values)) {
solution[ID == change, c("Second", "wgt") := values[i, .(Second, wgt)]]
set(values, i, "score", score(solution))
}
#return the best combination
scores <<- c(scores, values[, min(score)])
solution[ID == change, c("Second", "wgt") := values[which.min(score), .(Second, wgt)]]
}
#optimize
scores <- 1
iter <- 0
while(TRUE){
minconflict(df, current_solution)
iter <- iter + 1
#SET MAX NUMBER OF ITERATIONS HERE
if(scores[length(scores)] == 0 | iter >= 1000) break
}
# summarize obtained solution
solution_summary <- function(solution){
second_freq <- solution[, .(SecondCount = .N), by = Second]
names(second_freq)[1] <- "Name"
compare <- merge(First_freq, second_freq, by = "Name", all = TRUE)
compare[is.na(compare)] <- 0
compare[, diff := abs(FirstCount - SecondCount)]
print(compare)
print(paste("Total freq diff: ", compare[, sum(diff)]))
print(paste("Total wgt: ", solution[, sum(wgt)]))
}
solution_summary(current_solution)
This is basically a bipartite graph matching problem and so can be solved exactly in reasonable time, either by maxflow or linear programming (bipartite graph matching to match two sets).
library(lpSolve)
MISMATCH.COST <- 1000
.create.row <- function(row.names, first) {
row <- vector(mode="numeric", length=length(first))
for (i in 1:length(row.names))
row = row + (-MISMATCH.COST+i)*(row.names[i]==first)
return(row)
}
find.pairing <- function(First, Second) {
row.names = sapply(Second, strsplit, "; ")
# Create cost matrix for assignment
mat = sapply(row.names, .create.row, First)
assignment <- lp.assign(mat)
print("Total cost:")
print(assignment$objval+length(First)*MISMATCH.COST)
solution <- lp.assign(mat)$solution
pairs <- which(solution>0, arr.ind=T)
matches = First[pairs[,1]]
# Find out where a mismatch has occured, and replace match
for (i in 1:length(matches)) {
if (!(matches[i] %in% row.names[[i]])) {
matches[i] = row.names[[i]][1]
}
}
result = data.frame(
First[pairs[,2]],
matches)
return(result)
}
Running it on your example gives an optimal solution (as it should always do)
> First = c("MW3", "MW3", "MW4", "MW5", "MW6", "MW7", "MW7", "MW8")
> Second = c("MW4; MW5; MW6", "MW5; MW3; MW7", "MW8; MW7; MW3",
"MW5; MW6; MW4", "MW3; MW7; MW8", "MW6; MW8; MW4",
"MW3; MW4; MW5", "MW6; MW3; MW7")
Second = c("MW4; MW5; MW6", "MW5; MW3; MW7", "MW8; MW7; MW3",
+ "MW5; MW6; MW4", "MW3; MW7; MW8", "MW6; MW8; MW4",
+ "MW3; MW4; MW5", "MW6; MW3; MW7")
> find.pairing(First, Second)
[1] "Total cost:"
[1] 12
First.pairs...2.. matches
1 MW3 MW4
2 MW3 MW3
3 MW4 MW7
4 MW5 MW5
5 MW6 MW7
6 MW7 MW8
7 MW7 MW3
8 MW8 MW6
Related
Hello all a R noob here,
I hope you guys can help me with the following.
I need to transform multiple columns in my dataset to new columns based on the values in the original columns multiple times. This means that for the first transformation I use column 1, 2, 3 and if certain conditions are met the output results a new column with a 1 or a 0, for the second transformation I use columns 4, 5, 6 and the output should be a 1 or a 0 also. I have to do this 18 times. I already wrote a function which succesfully does the transformation if I impute the variables manually, but I would like to apply this function to all the desired columns at once. My desired output would be 18 new columns with 0's and 1's. Finally I will make a last column which will display a 1 if any of the 18 columns is a 1 and a 0 otherwise.
df <- data.frame(admiss1 = sample(seq(as.Date('1990/01/01'), as.Date('2000/01/01'), by="day"), 12),
admiss2 = sample(seq(as.Date('1990/01/01'), as.Date('2000/01/01'), by="day"), 12),
admiss3 = sample(seq(as.Date('1990/01/01'), as.Date('2000/01/01'), by="day"), 12),
visit1 = sample(seq(as.Date('1995/01/01'), as.Date('1996/01/01'), by="day"), 12),
visit2 = sample(seq(as.Date('1997/01/01'), as.Date('1998/01/01'), by="day"), 12),
reason1 = sample(3,12, replace = T),
reason2 = sample(3,12, replace = T),
reason3 = sample(3,12, replace = T))
df$discharge1 <- df$admiss1 + 10
df$discharge2 <- df$admiss2 + 10
df$discharge3 <- df$admiss3 + 10
#every discharge date is 10 days after the admission date for the sake of this example
#now I have the following dataframe
#for the sake of it I included only 3 dates and reasons(instead of 18)
admiss1 admiss2 admiss3 visit1 visit2 reason1 reason2 reason3 discharge1 discharge2 discharge3
1 1990-03-12 1992-04-04 1998-07-31 1995-01-24 1997-10-07 2 1 3 1990-03-22 1992-04-14 1998-08-10
2 1999-05-18 1990-11-25 1995-10-04 1995-03-06 1997-03-13 1 2 1 1999-05-28 1990-12-05 1995-10-14
3 1993-07-16 1998-06-10 1991-07-05 1995-11-06 1997-11-15 1 1 2 1993-07-26 1998-06-20 1991-07-15
4 1991-07-05 1992-06-17 1995-10-12 1995-05-14 1997-05-02 2 1 3 1991-07-15 1992-06-27 1995-10-22
5 1995-08-16 1999-03-08 1992-04-03 1995-02-20 1997-01-03 1 3 3 1995-08-26 1999-03-18 1992-04-13
6 1999-10-07 1991-12-26 1995-05-05 1995-10-24 1997-10-15 3 1 1 1999-10-17 1992-01-05 1995-05-15
7 1998-03-18 1992-04-18 1993-12-31 1995-11-14 1997-06-14 3 2 2 1998-03-28 1992-04-28 1994-01-10
8 1992-08-04 1991-09-16 1992-04-23 1995-05-29 1997-10-11 1 2 3 1992-08-14 1991-09-26 1992-05-03
9 1997-02-20 1990-02-12 1998-03-08 1995-10-09 1997-12-29 1 1 3 1997-03-02 1990-02-22 1998-03-18
10 1992-09-16 1997-06-16 1997-07-18 1995-12-11 1997-01-12 1 2 2 1992-09-26 1997-06-26 1997-07-28
11 1991-01-25 1998-04-07 1999-07-02 1995-12-27 1997-05-28 3 2 1 1991-02-04 1998-04-17 1999-07-12
12 1996-02-25 1993-03-30 1997-06-25 1995-09-07 1997-10-18 1 3 2 1996-03-06 1993-04-09 1997-07-05
admissdate <- function(admis, dis, rsn, vis1, vis2){
xnew <- ifelse(df[eval(substitute(admis))] >= df[eval(substitute(vis1))] & df[eval(substitute(dis))] <= df[eval(substitute(vis2))] & df[eval(substitute(rsn))] == 2, 1, 0)
xnew <- ifelse(df[eval(substitute(admis))] >= df[eval(substitute(vis1))] & df[eval(substitute(admis))] <= df[eval(substitute(vis2))] & df[eval(substitute(dis))] >= df[eval(substitute(vis2))] & df[eval(substitute(rsn))] == 2, 1, xnew)
return(xnew)
}
I wrote this function to generate a 1 if the conditions are true and a 0 if the conditions are false.
-Condition 1: admission date and discharge date are between visit 1 and visit 2 + admission reason is 2.
-Condition 2: admission date is after visit 1 but before visit 2 and the discharge date is after visit 2 with also admission reason 2.
It should return 1 if these conditions are true and 0 if these conditions are false. Eventually, I will end up with 18 new variables with 1's or 0's and will combine them to make one variable with Admission between visit 1 and visit 2 (with reason 2).
If I manually impute the variable names it will work, but I cant make it work for all the variables at once. I tried to make a string vector with all the admiss dates, discharge dates and reasons and tried to transform them with mapply, but this does not work.
admiss <- paste0(rep("admiss", 3), 1:3)
discharge <- paste0(rep("discharge", 3), 1:3)
reason <- paste0(rep("reason", 3), 1:3)
visit1 <- rep("visit1",3)
visit2 <- rep("visit2",3)
mapply(admissdate, admis = admiss, dis = discharge, rsn = reason, vis1 = visit1, vis2 = visit2)
I have also considered lapply but here you have to define an X = ..., which I think I cannot use because I have multiple column that I want to impute, please correct me if I am wrong!
Also I considered using a for loop, but I don't know how to use that with multiple conditions.
Any help would be greatly appreciated!
You can change the function to accept values instead of column names.
admissdate <- function(admis, dis, rsn, vis1, vis2){
xnew <- as.integer(admis >= vis1 & dis <= vis2 & rsn == 2)
xnew <- ifelse(admis >= vis1 & admis <= vis2 & dis >= vis2 & rsn == 2, 1, xnew)
return(xnew)
}
Now create new columns -
admiss <- paste0("admiss", 1:3)
discharge <- paste0("discharge", 1:3)
reason <- paste0("reason", 1:3)
new_col <- paste0('newcol', 1:3)
df[new_col] <- Map(function(x, y, z) admissdate(x, y, z, df$visit1, df$visit2),
df[admiss],df[discharge],df[reason])
#Additional column will be 1 if any of the value in the new column is 1.
df$result <- as.integer(rowSums(df[new_col]) > 0)
df
I am beginner of R. I need to transfer some Eviews code to R. There are some loop code to add 10 or more columns\variables with some function in data in Eviews.
Here are eviews example code to estimate deflator:
for %x exp con gov inv cap ex im
frml def_{%x} = gdp_{%x}/gdp_{%x}_r*100
next
I used dplyr package and use mutate function. But it is very hard to add many variables.
library(dplyr)
nominal_gdp<-rnorm(4)
nominal_inv<-rnorm(4)
nominal_gov<-rnorm(4)
nominal_exp<-rnorm(4)
real_gdp<-rnorm(4)
real_inv<-rnorm(4)
real_gov<-rnorm(4)
real_exp<-rnorm(4)
df<-data.frame(nominal_gdp,nominal_inv,
nominal_gov,nominal_exp,real_gdp,real_inv,real_gov,real_exp)
df<-df %>% mutate(deflator_gdp=nominal_gdp/real_gdp*100,
deflator_inv=nominal_inv/real_inv,
deflator_gov=nominal_gov/real_gov,
deflator_exp=nominal_exp/real_exp)
print(df)
Please help me to this in R by loop.
The answer is that your data is not as "tidy" as it could be.
This is what you have (with an added observation ID for clarity):
library(dplyr)
df <- data.frame(nominal_gdp = rnorm(4),
nominal_inv = rnorm(4),
nominal_gov = rnorm(4),
real_gdp = rnorm(4),
real_inv = rnorm(4),
real_gov = rnorm(4))
df <- df %>%
mutate(obs_id = 1:n()) %>%
select(obs_id, everything())
which gives:
obs_id nominal_gdp nominal_inv nominal_gov real_gdp real_inv real_gov
1 1 -0.9692060 -1.5223055 -0.26966202 0.49057546 2.3253066 0.8761837
2 2 1.2696927 1.2591910 0.04238958 -1.51398652 -0.7209661 0.3021453
3 3 0.8415725 -0.1728212 0.98846942 -0.58743294 -0.7256786 0.5649908
4 4 -0.8235101 1.0500614 -0.49308092 0.04820723 -2.0697008 1.2478635
Consider if you had instead, in df2:
obs_id variable real nominal
1 1 gdp 0.49057546 -0.96920602
2 2 gdp -1.51398652 1.26969267
3 3 gdp -0.58743294 0.84157254
4 4 gdp 0.04820723 -0.82351006
5 1 inv 2.32530662 -1.52230550
6 2 inv -0.72096614 1.25919100
7 3 inv -0.72567857 -0.17282123
8 4 inv -2.06970078 1.05006136
9 1 gov 0.87618366 -0.26966202
10 2 gov 0.30214534 0.04238958
11 3 gov 0.56499079 0.98846942
12 4 gov 1.24786355 -0.49308092
Then what you want to do is trivial:
df2 %>% mutate(deflator = real / nominal)
obs_id variable real nominal deflator
1 1 gdp 0.49057546 -0.96920602 -0.50616221
2 2 gdp -1.51398652 1.26969267 -1.19240392
3 3 gdp -0.58743294 0.84157254 -0.69801819
4 4 gdp 0.04820723 -0.82351006 -0.05853872
5 1 inv 2.32530662 -1.52230550 -1.52749012
6 2 inv -0.72096614 1.25919100 -0.57256297
7 3 inv -0.72567857 -0.17282123 4.19901294
8 4 inv -2.06970078 1.05006136 -1.97102841
9 1 gov 0.87618366 -0.26966202 -3.24919196
10 2 gov 0.30214534 0.04238958 7.12782060
11 3 gov 0.56499079 0.98846942 0.57158146
12 4 gov 1.24786355 -0.49308092 -2.53074800
So the question becomes: how do we get to the nice dplyr-compatible data.frame.
You need to gather your data using tidyr::gather. However, because you have 2 sets of variables to gather (the real and nominal values), it is not straightforward. I have done it in two steps, there may be a better way though.
real_vals <- df %>%
select(obs_id, starts_with("real")) %>%
# the line below is where the magic happens
tidyr::gather(variable, real, starts_with("real")) %>%
# extracting the variable name (by erasing up to the underscore)
mutate(variable = gsub(variable, pattern = ".*_", replacement = ""))
# Same thing for nominal values
nominal_vals <- df %>%
select(obs_id, starts_with("nominal")) %>%
tidyr::gather(variable, nominal, starts_with("nominal")) %>%
mutate(variable = gsub(variable, pattern = ".*_", replacement = ""))
# Merging them... Now we have something we can work with!
df2 <-
full_join(real_vals, nominal_vals, by = c("obs_id", "variable"))
Note the importance of the observation id when merging.
We can grep the matching names, and sort:
x <- colnames(df)
df[ sort(x[ (grepl("^nominal", x)) ]) ] /
df[ sort(x[ (grepl("^real", x)) ]) ] * 100
Similarly, if the columns were sorted, then we could just:
df[ 1:4 ] / df[ 5:8 ] * 100
We can loop over column names using purrr::map_dfc then apply a custom function over the selected columns (i.e. the columns that matched the current name from nms)
library(dplyr)
library(purrr)
#Replace anything before _ with empty string
nms <- unique(sub('.*_','',names(df)))
#Use map if you need the ouptut as a list not a dataframe
map_dfc(nms, ~deflator_fun(df, .x))
Custom function
deflator_fun <- function(df, x){
#browser()
nx <- paste0('nominal_',x)
rx <- paste0('real_',x)
select(df, matches(x)) %>%
mutate(!!paste0('deflator_',quo_name(x)) := !!ensym(nx) / !!ensym(rx)*100)
}
#Test
deflator_fun(df, 'gdp')
nominal_gdp real_gdp deflator_gdp
1 -0.3332074 0.181303480 -183.78433
2 -1.0185754 -0.138891362 733.36121
3 -1.0717912 0.005764186 -18593.97398
4 0.3035286 0.385280401 78.78123
Note: Learn more about quo_name, !!, and ensym which they are tools for programming with dplyr here
I have a matrix like this
head(a)
# A tibble: 6 x 4
date ROE ROFE ROTFE
<date> <dbl> <dbl> <dbl>
1 2000-01-31 0.033968932 0.0324214815 0.010205926
2 2000-02-29 0.006891111 -0.0003352941 -0.005230147
3 2000-03-31 0.006158519 0.0213992647 0.040399265
4 2000-04-28 0.060022222 0.0151191176 0.047586029
5 2000-05-31 -0.016960000 -0.0287617647 -0.036209559
6 2000-06-30 0.034133577 0.0144456522 0.030756522
I want to pick the value of a factor which has highest cumulative return last 2 months over time.
I have done something like this and it works.
However, my friend told me that it can be done in one or two lines of dplyr and I'm wondering if you could please show me how to do that.
index = as.Date(unique(a$date))
nmonth = 2;
mean.ROE = numeric()
for (i in 1:(length(index) - nmonth)) { # i = 2
index1 = index[i]
index2 = index[nmonth + i]
index3 = index[nmonth + i+1]
# Take a 2-month window of ROE returns:
b = a[a$date >= index1 & a$date < index2,] %>% mutate(cum.ROE = cumprod(1 + ROE)) %>% mutate(cum.ROFE = cumprod(1 + ROFE)) %>% mutate(cum.ROTFE = cumprod(1 + ROTFE))
# Use the cumulative return over the 2-month window to determine which factor is best.
mean.ROE1 = ifelse(b$cum.ROE[nmonth] > b$cum.ROFE[nmonth] & b$cum.ROE[nmonth] > b$cum.ROTFE[nmonth], a[a$date == index3,]$ROE, ifelse(b$cum.ROFE[nmonth] > b$cum.ROE[nmonth] & b$cum.ROFE[nmonth] > b$cum.ROTFE[nmonth], a[a$date == index3,]$ROFE, a[a$date == index3,]$ROTFE))
# Bind the answer to the answer vector
mean.ROE = rbind(mean.ROE, mean.ROE1)
}
Create a function maxret which takes 2 + nmonth rows, x, and calculates the cumulative returns, r, for each column of the first two rows. For the largest of those return the value in the last row of x.
Now use rollapplyr to apply it to a rolling window of width 2 + month:
library(zoo)
maxret <- function(x) {
r <- apply(1 + x[1:2, ], 2, prod)
x[2 + nmonth, which.max(r)]
}
z <- read.zoo(as.data.frame(a))
res <- rollapplyr(z, 2 + nmonth, maxret, by.column = FALSE)
giving the zoo series:
> res
2000-04-28 2000-05-31 2000-06-30
0.06002222 -0.03620956 0.03075652
If you want a data frame use fortify.zoo(res) .
Note: 1 The input was not provided in reproducible form in the question so I have assumed this data.frame:
Lines <-
"date ROE ROFE ROTFE
1 2000-01-31 0.033968932 0.0324214815 0.010205926
2 2000-02-29 0.006891111 -0.0003352941 -0.005230147
3 2000-03-31 0.006158519 0.0213992647 0.040399265
4 2000-04-28 0.060022222 0.0151191176 0.047586029
5 2000-05-31 -0.016960000 -0.0287617647 -0.036209559
6 2000-06-30 0.034133577 0.0144456522 0.030756522"
a <- read.table(text = Lines, header = TRUE)
Note 2: With the input in Note 1 or with zoo 1.8.1 (the development version of zoo) this line:
z <- read.zoo(as.data.frame(a))
could be simplified to just:
z <- read.zoo(a)
but we have added the as.data.frame part in the main code so it works with tibbles as well as straight data frames even with the current version of zoo on CRAN.
How do I get a histogram-like summary of interval data in R?
My MWE data has four intervals.
interval range
Int1 2-7
Int2 10-14
Int3 12-18
Int4 25-28
I want a histogram-like function which counts how the intervals Int1-Int4 span a range split across fixed-size bins.
The function output should look like this:
bin count which
[0-4] 1 Int1
[5-9] 1 Int1
[10-14] 2 Int2 and Int3
[15-19] 1 Int3
[20-24] 0 None
[25-29] 1 Int4
Here the range is [minfloor(Int1, Int2, Int3, Int40), maxceil(Int1, Int2, Int3, Int4)) = [0,30) and there are six bins of size = 5.
I would greatly appreciate any pointers to R packages or functions that implement the functionality I want.
Update:
So far, I have a solution from the IRanges package which uses a fast data structure called NCList, which is faster than Interval Search Trees according to users.
> library(IRanges)
> subject <- IRanges(c(2,10,12,25), c(7,14,18,28))
> query <- IRanges(c(0,5,10,15,20,25), c(4,9,14,19,24,29))
> countOverlaps(query, subject)
[1] 1 1 2 1 0 1
But I am still unable to get which are the ranges that overlap. Will update if I get through.
Using IRanges, you should use findOverlaps or mergeByOverlaps instead of countOverlaps. It, by default, doesn't return no matches though.
I'll leave that to you. Instead, will show an alternate method using foverlaps() from data.table package:
require(data.table)
subject <- data.table(interval = paste("int", 1:4, sep=""),
start = c(2,10,12,25),
end = c(7,14,18,28))
query <- data.table(start = c(0,5,10,15,20,25),
end = c(4,9,14,19,24,29))
setkey(subject, start, end)
ans = foverlaps(query, subject, type="any")
ans[, .(count = sum(!is.na(start)),
which = paste(interval, collapse=", ")),
by = .(i.start, i.end)]
# i.start i.end count which
# 1: 0 4 1 int1
# 2: 5 9 1 int1
# 3: 10 14 2 int2, int3
# 4: 15 19 1 int3
# 5: 20 24 0 NA
# 6: 25 29 1 int4
I am creating correlations using R, with the following code:
Values<-read.csv(inputFile, header = TRUE)
O<-Values$Abundance_O
S<-Values$Abundance_S
cor(O,S)
pear_cor<-round(cor(O,S),4)
outfile<-paste(inputFile, ".jpg", sep = "")
jpeg(filename = outfile, width = 15, height = 10, units = "in", pointsize = 10, quality = 75, bg = "white", res = 300, restoreConsole = TRUE)
rx<-range(0,20000000)
ry<-range(0,200000)
plot(rx,ry, ylab="S", xlab="O", main="O vs S", type="n")
points(O,S, col="black", pch=3, lwd=1)
mtext(sprintf("%s %.4f", "pearson: ", pear_cor), adj=1, padj=0, side = 1, line = 4)
dev.off()
pear_cor
I now need to find the lower quartile for each set of data and exclude data that is within the lower quartile. I would then like to rewrite the data without those values and use the new column of data in the correlation analysis (because I want to threshold the data by the lower quartile). If there is a way I can write this so that it is easy to change the threshold by applying arguments from Java (as I have with the input file name) that's even better!
Thank you so much.
I have now implicated the answer below and that is working, however I need to keep the pairs of data together for the correlation. Here is an example of my data (from csv):
Abundance_O Abundance_S
3635900.752 1390.883073
463299.4622 1470.92626
359101.0482 989.1609251
284966.6421 3248.832403
415283.663 2492.231265
2076456.856 10175.48946
620286.6206 5074.268802
3709754.717 269.6856808
803321.0892 118.2935093
411553.0203 4772.499758
50626.83554 17.29893001
337428.8939 203.3536852
42046.61549 152.1321255
1372013.047 5436.783169
939106.3275 7080.770535
96618.01393 1967.834701
229045.6983 948.3087208
4419414.018 23735.19352
So I need to exclude both values in the row if one does not meet my quartile threshold (0.25 quartile). So if the quartile for O was 45000 then the row "42046.61549,152.1321255" would be removed. Is this possible? If I read in both columns as a dataframe can I search each column separately? Or find the quartiles and then input that value into code to remove the appropriate rows?
Thanks again, and sorry for the evolution of the question!
Please try to provide a reproducible example, but if you have data in a data.frame, you can subset it using the quantile function as the logical test. For instance, in the following data we want to select only rows from the dataframe where the value of the measured variable 'Val' is above the bottom quartile:
# set.seed so you can reproduce these values exactly on your system
set.seed(39856)
df <- data.frame( ID = 1:10 , Val = runif(10) )
df
ID Val
1 1 0.76487516
2 2 0.59755578
3 3 0.94584374
4 4 0.72179297
5 5 0.04513418
6 6 0.95772248
7 7 0.14566118
8 8 0.84898704
9 9 0.07246594
10 10 0.14136138
# Now to select only rows where the value of our measured variable 'Val' is above the bottom 25% quartile
df[ df$Val > quantile(df$Val , 0.25 ) , ]
ID Val
1 1 0.7648752
2 2 0.5975558
3 3 0.9458437
4 4 0.7217930
6 6 0.9577225
7 7 0.1456612
8 8 0.8489870
# And check the value of the bottom 25% quantile...
quantile(df$Val , 0.25 )
25%
0.1424363
Although this is an old question, I came across it during research of my own and I arrived at a solution that someone may be interested in.
I first defined a function which will convert a numerical vector into its quantile groups. Parameter n determines the quantile length (n = 4 for quartiles, n = 10 for deciles).
qgroup = function(numvec, n = 4){
qtile = quantile(numvec, probs = seq(0, 1, 1/n))
out = sapply(numvec, function(x) sum(x >= qtile[-(n+1)]))
return(out)
}
Function example:
v = rep(1:20)
> qgroup(v)
[1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4
Consider now the following data:
dt = data.table(
A0 = runif(100),
A1 = runif(100)
)
We apply qgroup() across the data to obtain two quartile group columns:
cols = colnames(dt)
qcols = c('Q0', 'Q1')
dt[, (qcols) := lapply(.SD, qgroup), .SDcols = cols]
head(dt)
> A0 A1 Q0 Q1
1: 0.72121846 0.1908863 3 1
2: 0.70373594 0.4389152 3 2
3: 0.04604934 0.5301261 1 3
4: 0.10476643 0.1108709 1 1
5: 0.76907762 0.4913463 4 2
6: 0.38265848 0.9291649 2 4
Lastly, we only include rows for which both quartile groups are above the first quartile:
dt = dt[Q0 + Q1 > 2]