Find position of elements of a dataframe inside other dataframe with R - r

I have the following dataframe (DF_A):
PARTY_ID PROBS_3001 PROBS_3002 PROBS_3003 PROBS_3004 PROBS_3005 PROBS_3006 PROBS_3007 PROBS_3008
1: 1000000 0.03 0.58 0.01 0.42 0.69 0.98 0.55 0.96
2: 1000001 0.80 0.37 0.10 0.95 0.77 0.69 0.23 0.07
3: 1000002 0.25 0.73 0.79 0.83 0.24 0.82 0.81 0.01
4: 1000003 0.10 0.96 0.53 0.59 0.96 0.10 0.98 0.76
5: 1000004 0.36 0.87 0.76 0.03 0.95 0.40 0.53 0.89
6: 1000005 0.15 0.78 0.24 0.21 0.03 0.87 0.67 0.64
And I have this other dataframe (DF_B):
V1 V2 V3 V4 PARTY_ID
1 0.58 0.69 0.96 0.98 1000000
2 0.69 0.77 0.80 0.95 1000001
3 0.79 0.81 0.82 0.83 1000002
4 0.76 0.96 0.96 0.98 1000003
5 0.76 0.87 0.89 0.95 1000004
6 0.64 0.67 0.78 0.87 1000005
I need to find the position of the elements of the DF_A in the DF_B to have something like this:
PARTY_ID P1 P2 P3 P4
1 1000000 3 6 9 7
...
Currently I'm working with match function but it takes a lot of time (I have 400K rows). I'm doing this:
i <- 1
while(i < nrow(DF_A)){
position <- match(DF_B[i,],DF_A[i,])
i <- i + 1
}
Although it works, it's very slow and I know that it's not the best answer to my problem. Can anyone help me please??

You can merge and then Map with a by group operation:
df_a2 <- df_a[setDT(df_b), on = "PARTY_ID"]
df_a3 <- df_a2[, c(PARTY_ID,
Map(f = function(x,y) which(x==y),
x = list(.SD[,names(df_a), with = FALSE]),
y = .SD[, paste0("V",1:4), with = FALSE])), by = 1:nrow(df_a2)]
setnames(df_a3, paste0("V",1:5), c("PARTY_ID", paste0("P", 1:4)))[,nrow:=NULL]
df_a3
# PARTY_ID P1 P2 P3 P4
#1: 1000000 3 6 9 7
#2: 1000001 7 6 2 5
#3: 1000002 4 8 7 5
#4: 1000003 9 3 3 8
#5: 1000003 9 6 6 8
#6: 1000004 4 3 9 6
#7: 1000005 9 8 3 7

Here is an example on 1 milion rows with two columns. It takes 14 ms on my computer.
# create data tables with matching ids but on different positions
x <- as.data.table(data.frame(id=sample(c(1:1000000), 1000000, replace=FALSE), y=sample(LETTERS, 1000000, replace=TRUE)))
y <- as.data.table(data.frame(id=sample(c(1:1000000), 1000000, replace=FALSE), z=sample(LETTERS, 1000000, replace=TRUE)))
# add column to both data tables which will store the position in x and y
x$x_row_nr <- 1:nrow(x)
y$y_row_nr <- 1:nrow(y)
# set key in both data frames using matching columns name
setkey(x, "id")
setkey(y, "id")
# merge data tables into one
z <- merge(x,y)
# now you just use this to extract what is the position
# of 100 hundreth record in x data table in y data table
z[x_row_nr==100, y_row_nr]
z will contain matching row records from both datasets with there columns attached.

Related

Replace values from multiple columns based on value from adjacent column

# Create a data frame
> df <- data.frame(a = rnorm(7), b = rnorm(7), c = rnorm(7), threshold = rnorm(7))
> df <- round(abs(df), 2)
>
> df
a b c threshold
1 1.17 0.27 1.26 0.19
2 1.41 1.57 1.23 0.97
3 0.16 0.11 0.35 1.34
4 0.03 0.04 0.10 1.50
5 0.23 1.10 2.68 0.45
6 0.99 1.36 0.17 0.30
7 0.28 0.68 1.22 0.56
>
>
# Replace values in columns a, b, and c with NA if > value in threshold
> df[1:3][df[1:3] > df[4]] <- "NA"
Error in Ops.data.frame(df[1:3], df[4]) :
‘>’ only defined for equally-sized data frames
There could be some obvious solutions that I am incapable of producing. The intent is to replace values in columns "a", "b", and "c" with NA if the value is larger than that in "threshold". And I need to do that row-by-row.
If I had done it right, the df would look like this:
a b c threshold
1 NA NA NA 0.19
2 NA NA NA 0.97
3 0.16 0.11 0.35 1.34
4 0.03 0.04 0.10 1.50
5 0.23 NA NA 0.45
6 NA NA 0.17 0.30
7 0.28 NA NA 0.56
I had also tried the apply() approach but to no avail. Can you help, please??
You should use dplyr for most of such use cases.
One way below:
> set.seed(10)
> df <- data.frame(a = rnorm(7), b = rnorm(7), c = rnorm(7), threshold = rnorm(7))
> df <- round(abs(df), 2)
> df
a b c threshold
1 0.02 0.36 0.74 2.19
2 0.18 1.63 0.09 0.67
3 1.37 0.26 0.95 2.12
4 0.60 1.10 0.20 1.27
5 0.29 0.76 0.93 0.37
6 0.39 0.24 0.48 0.69
7 1.21 0.99 0.60 0.87
>
> df %>%
+ mutate_at(vars(a:c), ~ifelse(.x > df$threshold, NA, .x))
a b c threshold
1 0.02 0.36 0.74 2.19
2 0.18 NA 0.09 0.67
3 1.37 0.26 0.95 2.12
4 0.60 1.10 0.20 1.27
5 0.29 NA NA 0.37
6 0.39 0.24 0.48 0.69
7 NA NA 0.60 0.87
You can use apply function across dataframe
df[,c(1:3)]<- apply(df[,c(1:3),drop=F], 2, function(x){ ifelse(x>df[,4],NA,x)})
The problem with your code was the usage of df[4] instead of df[, 4]. The difference is that df[4] returns a data.frame with one column and df[, 4] returns a vector.
That's why
df[1:3] > df[4]
returns
error in Ops.data.frame(df[1:3], df[4]) :
‘>’ only defined for equally-sized data frames
While this works as expected
df[1:3][df[1:3] > df[, 4]] <- NA
df
# a b c threshold
#1 0.63 0.74 NA 0.78
#2 NA NA 0.04 0.07
#3 0.84 0.31 0.02 1.99
#4 NA NA NA 0.62
#5 NA NA NA 0.06
#6 NA NA NA 0.16
#7 0.49 NA 0.92 1.47
data
set.seed(1)
df <- data.frame(a = rnorm(7), b = rnorm(7), c = rnorm(7), threshold = rnorm(7))
df <- round(abs(df), 2)
You can use a for-loop like this:
for(i in 1:(ncol(df)-1)){
df[, i] <- ifelse(df[, i] > df[, 4], NA, df[, i])
}

Sorting values in dataframe by order of values in another dataframe R

I would like to sort values in columns of the xy1 dataframe, based on the increasing order of values in columns of the xy dataframe.
x <- c(3,1,7,45,22,2)
y <- c(23,65,1,23,2,11)
xy <- data.frame(x,y)
x1 <- c(0.34,0.3,0.7,0.22,0.67,0.87)
y1 <- c(0.4,0.13,0.17,0.72,0.61,0.7)
xy1 <- data.frame(x1,y1)
> xy
x y
1 3 23
2 1 65
3 7 1
4 45 23
5 22 2
6 2 11
> xy1
x1 y1
1 0.34 0.40
2 0.30 0.13
3 0.70 0.17
4 0.22 0.72
5 0.67 0.61
6 0.87 0.70
The following is a new data.frame result that I desire - note it deals with repeated observations (two the same values in y). x1 and y1 are now sorted according to the order of values in each column of xy dataframe.
x1 y1
1 0.30 0.17
2 0.87 0.61
3 0.34 0.70
4 0.70 0.40
5 0.67 0.72
6 0.22 0.13
You can use the order function to get the sorting order of a vector.
x <- c(3,1,7,45,22,2)
y <- c(23,65,1,23,2,11)
xy <- data.frame(x,y)
x1 <- c(0.34,0.3,0.7,0.22,0.67,0.87)
y1 <- c(0.4,0.13,0.17,0.72,0.61,0.7)
xy1 <- data.frame(x1,y1)
result <- data.frame(x1[order(x)], y1[order(y)])
result
This produces
x1.order.x.. y1.order.y..
1 0.30 0.17
2 0.87 0.61
3 0.34 0.70
4 0.70 0.40
5 0.67 0.72
6 0.22 0.13
You can beautify the output by setting the column names in the result:
data.frame(x1=x1[order(x)], y1=y1[order(y)])
Now if you don't want to manually type in everything but have two data frames with the same dimensions that you can use this one-liner
sapply(1:ncol(xy1), function(i) {xy1[order(xy[,i]), i]})
which produces
[,1] [,2]
[1,] 0.30 0.17
[2,] 0.87 0.61
[3,] 0.34 0.70
[4,] 0.70 0.40
[5,] 0.67 0.72
[6,] 0.22 0.13
As this is based on ordering corresponding columns on both datasets, Map can be used
xy1[] <- Map(function(x,y) x[order(y)], xy1, xy)
xy1
# x1 y1
#1 0.30 0.17
#2 0.87 0.61
#3 0.34 0.70
#4 0.70 0.40
#5 0.67 0.72
#6 0.22 0.13
Or another option is to order based on the col of 'xy', 'xy'
xy1[] <- as.matrix(xy1)[order(col(xy), xy)]
xy1
# x1 y1
#1 0.30 0.17
#2 0.87 0.61
#3 0.34 0.70
#4 0.70 0.40
#5 0.67 0.72
#6 0.22 0.13
You could try this:
library(tidyverse)
df_1 <- xy %>%
bind_cols(xy1) %>%
arrange(x) %>%
select(x1)
df_2 <- xy %>%
bind_cols(xy1) %>%
arrange(y) %>%
select(y1)
df <- bind_cols(df_1, df_2)
Which returns:
# A tibble: 6 x 2
x1 y1
<dbl> <dbl>
1 0.30 0.17
2 0.87 0.61
3 0.34 0.70
4 0.70 0.40
5 0.67 0.72
6 0.22 0.13
Basically just arrange x1 and y1 by x and y separately, then combine x1 and y1.

R :How to execute FOR loop for Kmeans

I have an input file with Format as below :
RN KEY MET1 MET2 MET3 MET4
1 1 0.11 0.41 0.91 0.17
2 1 0.94 0.02 0.17 0.84
3 1 0.56 0.64 0.46 0.7
4 1 0.57 0.23 0.81 0.09
5 2 0.82 0.67 0.39 0.63
6 2 0.99 0.90 0.34 0.84
7 2 0.83 0.01 0.70 0.29
I have to execute Kmeans in R -separately for DF with Key=1 and Key=2 and so on...
Afterwards the final output CSV should look like
RN KEY MET1 MET2 MET3 MET4 CLST
1 1 0.11 0.41 0.91 0.17 1
2 1 0.94 0.02 0.17 0.84 1
3 1 0.56 0.64 0.46 0.77 2
4 1 0.57 0.23 0.81 0.09 2
5 2 0.82 0.67 0.39 0.63 1
6 2 0.99 0.90 0.34 0.84 2
7 2 0.83 0.01 0.70 0.29 2
Ie Key=1 is to be treated as separate DF and Key=2 is be treated as separate DF and so on...
Finally the output of clustering (of each DF)is to be combined with Key column first (since Key cannot participate in clustering) and then combined with each different DF for final output
In the above example :
DF1 is
KEY MET1 MET2 MET3 MET4
1 0.11 0.41 0.91 0.17
1 0.94 0.02 0.17 0.84
1 0.56 0.64 0.46 0.77
1 0.57 0.23 0.81 0.09
DF2 is
KEY MET1 MET2 MET3 MET4
2 0.82 0.67 0.39 0.63
2 0.99 0.90 0.34 0.84
2 0.83 0.01 0.70 0.29
Please suggest how to achieve in R
Psuedo code :
n<-Length(unique(Mydf$key))
for i=1 to n
{
#fetch partial df for each value of Key and run K means
dummydf<-subset(mydf,mydf$key=i
KmeansIns<-Kmeans(dummydf,2)
# combine with cluster result
dummydf<-data.frame(dummydf,KmeansIns$cluster)
# combine each smalldf into final Global DF
finaldf<-data.frame(finaldf,dummydf)
}Next i
#Now we have finaldf then it can be written to file
I think the easiest way would be to use by. Something along the lines of
by(data = DF, INDICES = DF$KEY, FUN = function(x) {
# your clustering code here
})
where x is a subset of your DF for each KEY.
A solution using data.tables.
library(data.table)
setDT(DF)[,CLST:=kmeans(.SD, centers=2)$clust, by=KEY, .SDcols=3:6]
DF
# RN KEY MET1 MET2 MET3 MET4 CLST
# 1: 1 1 0.11 0.41 0.91 0.17 2
# 2: 2 1 0.94 0.02 0.17 0.84 1
# 3: 3 1 0.56 0.64 0.46 0.70 1
# 4: 4 1 0.57 0.23 0.81 0.09 2
# 5: 5 2 0.82 0.67 0.39 0.63 2
# 6: 6 2 0.99 0.90 0.34 0.84 2
# 7: 7 2 0.83 0.01 0.70 0.29 1
#Read data
mdf <- read.table("mydat.txt", header=T)
#Convert to list based on KEY column
mls <- split(mdf, f=mdf$KEY)
#Define columns to use in clustering
myv <- paste0("MET", 1:4)
#Cluster each df item in list : modify kmeans() args as appropriate
kls <- lapply(X=mls, FUN=function(x){x$clust <- kmeans(x[, myv],
centers=2)$cluster ; return(x)})
#Make final "global" df
finaldf <- do.call(rbind, kls)

How to store multiple vectors in elements of a list?

I have generated a file that looks like this (here is an extract, roughly each one of two files with names having columns 1 and 2 in common are multiplied by a different parameter and the best fit, i.e. the lowest chi2, is returned in column 3 with the appropriate parameters in columns 4 and 5):
10 05 0.42 0.13 0.01
10 10 0.30 0.12 0.01
10 15 0.25 0.11 0.07
15 05 0.29 0.12 0.01
15 10 0.25 0.11 0.06
15 15 0.23 0.10 0.02
20 05 0.25 0.11 0.03
20 10 0.23 0.12 0.04
20 15 0.23 0.13 0.05
25 05 0.23 0.10 0.03
25 10 0.23 0.10 0.08
25 15 0.24 0.09 0.05
I am starting/learning to use lists as my codes are really to slow using for loops (currently I am using 4 for loops so it's insanely long), and I don't know enough to re-write my optimisation code so it does not take 8 hours to work. So instead, to reorganise the output I was wondering if it would be possible to create a list, say mytemplist, that reads:
> mytemplist
$5
[1] 10 15 20 25
[2] 0.42 0.29 0.25 0.23
[3] 0.13 0.12 0.11 0.10
[4] 0.01 0.01 0.03 0.03
$10
[1] 10 15 20 25
[2] 0.30 0.25 0.23 0.23
[3] 0.12 0.11 0.12 0.10
[4] 0.01 0.06 0.04 0.08
$15
[1] 10 15 20 25
[2] 0.25 0.23 0.23 0.24
[3] 0.11 0.10 0.13 0.09
[4] 0.07 0.02 0.05 0.05
I have looked at questions about lists and I could only sort that out by creating lists within lists which is not helping here.
EDIT:
the accepted answer replies to the specific question above, to answer #rawr post I join how the file is generated (it's not pretty, I am not using opt so far as I will evolve the code to optimise with bigger freedom around the data points):
note: typical file to read are 2-column file (just lists of numbers) and named a10b05s and a10b05t
dataname is also a 2-column file
in those 3 files the first column is the same and represents the pivots
need to find par[1] and par[2] such that par[1]*a10b05s + par[2]*a10b05t best fit data
par <- rep(NA, 2)
pivot <- read.table(dataname)[[1]]
data2fit <- read.table(dataname)[[2]]
for (i in 1:10){
vala <- 10+5*(i-1)
namei <- paste("a", vala, sep="")
for (j in 1:10){
#creates a coordinates for storage
cglobal <- (i-1) * 10 + j
valb <- 5+5*(j-1)
namej1 <- paste(namei, "b", valb, "s", sep="")
namej2 <- paste(namei, "b", valb, "t", sep="")
infile1 <- read.table(namej1)
infile2 <- read.table(namej2)
# infile1 prominent wrt infile2 so first quick determination of par1
tempspace1 <- seq(0.001, 0.009, 0.001)
par1_s1 <- c(tempspace1, tempspace1*10, tempspace1*100)
opt1_par1 <- rep(NA, length(par1))
# set a pivot for comparison at position named temppivot find par1 wrt temppivot
for(k in 1:length(par1){
opt1_par1[k] <- abs(par1_s1[k]*infile1[[1]][temppivot] - data2fit[temppivot])
}
par[1] <- par1_s1[match(min(opt1_par1)), opt1_par1]
# set a space for a finer fit for par[1]
par1_s2 <- seq(par[1]-5*par[1]/10, par[1]+5*par[1]/10, par[1]/100)
# set a space to fit par[2] note that there is an option in the code to choose btw 0.001-0.01, 0.01-0.1 etc.
tempspace2 <- seq(0.001, 0.009, 0.0001)
par2 <- c(tempspace2, tempspace2*10, tempspace2*100)
chi2 <- rep(NA, length(par1_s2)*length(par2))
#data2fit
for(z in 1:length(par1_s2)){
for(w in 1:length(par2)){
par[1] <- par1_s2[z]
par[2] <- par2[w]
thesum <- rep(NA, length(pivot))
for(h in 1:length(pivot)){
c1 <- pivot[h]
thesum[h] <- par[1] * infile[[1]][c1] + par[2] * infile2[[1]][c1]
}
c2 <- (z-1) * length(par2) + w
chi2[c2] <- sum((thesum-data2fit)^2/thesum)
}
}
whichbestfit <- match(min(chi2), chi2)
chi2min <- min(chi2)
localparfinder <- function(x){
temp1 <- trunc(x/length(par2)) + 1
temp2 <- x - (temp1 -1) * length(par2)
y <- c(par1_s2[temp1], par2[temp2])
}
par <- localparfinder(whichbestfit)
# creates the table of the original post
storage[cglobal,] <- c(vala, valb, chi2min, par[1], par[2])
}
}
write.table(storage, file=paste("storage_", format(Sys.time(), "%d%b_%H%M"), ".dat", sep="")
You can use by and transpose, like this:
by(mydf[-2], mydf[[2]], t)
# mydf[[2]]: 5
# 1 4 7 10
# V1 10.00 15.00 20.00 25.00
# V3 0.42 0.29 0.25 0.23
# V4 0.13 0.12 0.11 0.10
# V5 0.01 0.01 0.03 0.03
# -----------------------------------------------------------
# mydf[[2]]: 10
# 2 5 8 11
# V1 10.00 15.00 20.00 25.00
# V3 0.30 0.25 0.23 0.23
# V4 0.12 0.11 0.12 0.10
# V5 0.01 0.06 0.04 0.08
# -----------------------------------------------------------
# mydf[[2]]: 15
# 3 6 9 12
# V1 10.00 15.00 20.00 25.00
# V3 0.25 0.23 0.23 0.24
# V4 0.11 0.10 0.13 0.09
# V5 0.07 0.02 0.05 0.05
The result of the above is a list with a class of by. If you used unclass on it, it would be similar to the split + lapply approach.
Here's one possibility:
lst <- split(df[-2], df$V2) # split according to colum 2 and drop column 2 in the output
lst <- lapply(lst, t) # transpose each list element
lst
# $`5`
# 1 4 7 10
#V1 10.00 15.00 20.00 25.00
#V3 0.42 0.29 0.25 0.23
#V4 0.13 0.12 0.11 0.10
#V5 0.01 0.01 0.03 0.03
#
#$`10`
# 2 5 8 11
#V1 10.00 15.00 20.00 25.00
#V3 0.30 0.25 0.23 0.23
#V4 0.12 0.11 0.12 0.10
#V5 0.01 0.06 0.04 0.08
#
#$`15`
# 3 6 9 12
#V1 10.00 15.00 20.00 25.00
#V3 0.25 0.23 0.23 0.24
#V4 0.11 0.10 0.13 0.09
#V5 0.07 0.02 0.05 0.05
If you like it more compact, you could also nest the split and lapply like this:
lst <- lapply(split(df[-2], df$V2), t)

Weighted row average in time series join

Hello I'm looking for the cleanest/fastest way to solve the following problem:
My setup looks like this
library(data.table)
set.seed(1234)
DT1 <- data.table(replicate(12,runif(5)))
setnames(DT1,LETTERS[1:12])
DT1[,time:=100]
DT2 <- data.table(time=rep(100,12), grp=rep(c("X","Y","Z"),each=4),
sub=LETTERS[1:12], weight=sample(1:100,12))
options(digits=2)
DT1
A B C D E F G H I J K L time
1: 0.11 0.6403 0.69 0.84 0.32 0.811 0.46 0.76 0.55 0.50 0.074 0.50 100
2: 0.62 0.0095 0.54 0.29 0.30 0.526 0.27 0.20 0.65 0.68 0.310 0.49 100
3: 0.61 0.2326 0.28 0.27 0.16 0.915 0.30 0.26 0.31 0.48 0.717 0.75 100
4: 0.62 0.6661 0.92 0.19 0.04 0.831 0.51 0.99 0.62 0.24 0.505 0.17 100
5: 0.86 0.5143 0.29 0.23 0.22 0.046 0.18 0.81 0.33 0.77 0.153 0.85 100
> DT2
time grp sub weight
1: 100 X A 87
2: 100 X B 5
3: 100 X C 32
4: 100 X D 2
5: 100 Y E 23
6: 100 Y F 68
7: 100 Y G 29
8: 100 Y H 48
9: 100 Z I 99
10: 100 Z J 52
11: 100 Z K 11
12: 100 Z L 80
I want to compute a weighted average (per row) of the columns of DT1 by referencing the groups, subclasses & weights from DT2, while joining per time point.
E.g. so DT1 then gets columns X,Y & Z bound to it, so in this case the column X of the first row is 87*0.11 + 5*0.64 + 32*0.69 + 2*0.84 / (87 + 5 + 32 + 2)
There are millions of rows in DT1 with different time points, so memory might be a limiting factor though
Any advice would be much appreciated!
Something like this perhaps:
library(reshape2)
setkey(DT2, time, sub)
DT2[melt(DT1, id.var = 'time')[, row := 1:.N, by = list(time, variable)]][,
sum(weight * value) / sum(weight), by = list(time, grp, row)]
# time grp row V1
# 1: 100 X 1 0.29
# 2: 100 X 2 0.57
# 3: 100 X 3 0.51
# 4: 100 X 4 0.69
# 5: 100 X 5 0.69
# 6: 100 Y 1 0.67
# 7: 100 Y 2 0.36
# 8: 100 Y 3 0.52
# 9: 100 Y 4 0.71
#10: 100 Y 5 0.31
#11: 100 Z 1 0.50
#12: 100 Z 2 0.59
#13: 100 Z 3 0.51
#14: 100 Z 4 0.39
#15: 100 Z 5 0.59
You can also reshape the above result if you like:
# assuming you called the above table "res"
dcast.data.table(res, row + time ~ grp)
#Using 'V1' as value column. Use 'value.var' to override
# row time X Y Z
#1: 1 100 0.29 0.67 0.50
#2: 2 100 0.57 0.36 0.59
#3: 3 100 0.51 0.52 0.51
#4: 4 100 0.69 0.71 0.39
#5: 5 100 0.69 0.31 0.59

Resources