Replace values from multiple columns based on value from adjacent column - r

# Create a data frame
> df <- data.frame(a = rnorm(7), b = rnorm(7), c = rnorm(7), threshold = rnorm(7))
> df <- round(abs(df), 2)
>
> df
a b c threshold
1 1.17 0.27 1.26 0.19
2 1.41 1.57 1.23 0.97
3 0.16 0.11 0.35 1.34
4 0.03 0.04 0.10 1.50
5 0.23 1.10 2.68 0.45
6 0.99 1.36 0.17 0.30
7 0.28 0.68 1.22 0.56
>
>
# Replace values in columns a, b, and c with NA if > value in threshold
> df[1:3][df[1:3] > df[4]] <- "NA"
Error in Ops.data.frame(df[1:3], df[4]) :
‘>’ only defined for equally-sized data frames
There could be some obvious solutions that I am incapable of producing. The intent is to replace values in columns "a", "b", and "c" with NA if the value is larger than that in "threshold". And I need to do that row-by-row.
If I had done it right, the df would look like this:
a b c threshold
1 NA NA NA 0.19
2 NA NA NA 0.97
3 0.16 0.11 0.35 1.34
4 0.03 0.04 0.10 1.50
5 0.23 NA NA 0.45
6 NA NA 0.17 0.30
7 0.28 NA NA 0.56
I had also tried the apply() approach but to no avail. Can you help, please??

You should use dplyr for most of such use cases.
One way below:
> set.seed(10)
> df <- data.frame(a = rnorm(7), b = rnorm(7), c = rnorm(7), threshold = rnorm(7))
> df <- round(abs(df), 2)
> df
a b c threshold
1 0.02 0.36 0.74 2.19
2 0.18 1.63 0.09 0.67
3 1.37 0.26 0.95 2.12
4 0.60 1.10 0.20 1.27
5 0.29 0.76 0.93 0.37
6 0.39 0.24 0.48 0.69
7 1.21 0.99 0.60 0.87
>
> df %>%
+ mutate_at(vars(a:c), ~ifelse(.x > df$threshold, NA, .x))
a b c threshold
1 0.02 0.36 0.74 2.19
2 0.18 NA 0.09 0.67
3 1.37 0.26 0.95 2.12
4 0.60 1.10 0.20 1.27
5 0.29 NA NA 0.37
6 0.39 0.24 0.48 0.69
7 NA NA 0.60 0.87

You can use apply function across dataframe
df[,c(1:3)]<- apply(df[,c(1:3),drop=F], 2, function(x){ ifelse(x>df[,4],NA,x)})

The problem with your code was the usage of df[4] instead of df[, 4]. The difference is that df[4] returns a data.frame with one column and df[, 4] returns a vector.
That's why
df[1:3] > df[4]
returns
error in Ops.data.frame(df[1:3], df[4]) :
‘>’ only defined for equally-sized data frames
While this works as expected
df[1:3][df[1:3] > df[, 4]] <- NA
df
# a b c threshold
#1 0.63 0.74 NA 0.78
#2 NA NA 0.04 0.07
#3 0.84 0.31 0.02 1.99
#4 NA NA NA 0.62
#5 NA NA NA 0.06
#6 NA NA NA 0.16
#7 0.49 NA 0.92 1.47
data
set.seed(1)
df <- data.frame(a = rnorm(7), b = rnorm(7), c = rnorm(7), threshold = rnorm(7))
df <- round(abs(df), 2)

You can use a for-loop like this:
for(i in 1:(ncol(df)-1)){
df[, i] <- ifelse(df[, i] > df[, 4], NA, df[, i])
}

Related

Extracting dataframe values using indices in R

I have 100+ files and have starting and ending coordinates for each file. So based on starting and ending coordinates, I want to extract the regions from all data sets and want to store in file. I have used following approach but its not giving me the expected out put.
startco have the starting indices of 1st 2nd 3rd file respectively and endco have ending indices of 1st 2nd 3rd file respectively. And if the indices is going beyond the files indices want to put NA
Example:
startco<-c(3,4,1)
endco<-c(5,6,2)
ctc<-c(1,2,3)
for (hm0 in 1:length(ctc)) {
for (hm1 in 1:length(startco)) {
for (hm2 in 1:length(endco)) {
methd1<-read.table( paste0("path/to folder/","file_",ctc[hm0],".txt"))
methd2<- methd1[,startco[hm1]:endco[hm2]]
}
}
}
File_1.txt
V1 V2 V3 V4 V5
41 42 43 45 46
0.31 0.21 0.87 0.65 0.54
0.32 0.28 0.74 0.87 0.65
0.19 0.12 0.99 0.99 0.89
File_2.txt
V1 V2 V3 V4 V5
12 24 13 14 16
0.89 0.78 0.50 0.22 0.34
0.54 0.78 0.50 0.34 0.41
0.78 0.54 0.66 0.26 0.14
File_3.txt
V1 V2 V3 V4 V5
1 2 3 5 6
0.20 0.40 0.50 0.49 0.52
Expected output :
43 45 46
0.87 0.65 0.54
0.74 0.87 0.65
0.99 0.99 0.89
0.22 0.34 NA
0.34 0.41 NA
0.99 0.89 NA
1 2
0.20 0.40
in Base R you could do:
fun <- function(path, start, end){
id <- basename(path)
dat <- read.table(path, header = TRUE)
p <- ncol(dat)
n <- nrow(dat)
neg <- if(start<0) -start else 0
add <- matrix(nrow = n, ncol = neg)
if (start < 1) start <- 1
if (end > p) end <- p
d <- cbind(add, dat[, start:end])
names(d) <- paste0('X', seq(ncol(d)))
cbind(id,r = seq(nrow(d)), d)
}
startco<-c(3,4,-2) # TAKES NEGATIVE INDICES
endco<-c(5,6,2)
ctc<-c(1,2,3)
files <- file.path('path/to/folder', ctc)
A <- Map(fun, files, startco, endco)
Reduce(function(x, y)merge(x,y, all =TRUE), A)[, -(1:2)]
X1 X2 X3 X4
1 43.00 45.00 46.00 NA
2 0.87 0.65 0.54 NA
3 0.74 0.87 0.65 NA
4 0.99 0.99 0.89 NA
5 14.00 16.00 NA NA
6 0.22 0.34 NA NA
7 0.34 0.41 NA NA
8 0.26 0.14 NA NA
9 NA NA 1.00 2.0
10 NA NA 0.20 0.4
The one with no negatives
startco<-c(3,4,1)
B <- Map(fun, files, startco, endco)
Reduce(function(x, y)merge(x,y, all =TRUE), B)[, -(1:2)]
X1 X2 X3
1 43.00 45.00 46.00
2 0.87 0.65 0.54
3 0.74 0.87 0.65
4 0.99 0.99 0.89
5 14.00 16.00 NA
6 0.22 0.34 NA
7 0.34 0.41 NA
8 0.26 0.14 NA
9 1.00 2.00 NA
10 0.20 0.40 NA
I would use a readfun,
readfun <- \(i, s, e) {
stopifnot(s != 0)
r <- read.table(paste0("foo1/", "file_", i, ".txt"), header=TRUE)
if (e > ncol(r)) { ## inserts cols to the right if e > ncol
e1 <- e - ncol(r)
nm <- paste0('V', as.numeric(substring(colnames(r), 2)[ncol(r)]) + seq_len(e1))
m <- matrix(NA_real_, nrow(r), e1, dimnames=list(NULL, nm))
r <- cbind(r, m)
}
if (s < 0) { ## inserts cols to the left if s < 0
m <- matrix(NA_real_, nrow(r), -s)
r <- cbind(m, r)
e <- e + -s
s <- 1
}
out <- r[, s:e]
unname(as.matrix(out))
}
in Map.
ctc <- c(1, 2, 3); startco <- c(3, 4, -2); endco <- c(5, 6, 2)
Map(readfun, ctc, startco, endco)
# [[1]]
# [,1] [,2] [,3]
# [1,] 43.00 45.00 46.00
# [2,] 0.87 0.65 0.54
# [3,] 0.74 0.87 0.65
# [4,] 0.99 0.99 0.89
#
# [[2]]
# [,1] [,2] [,3]
# [1,] 14.00 16.00 NA
# [2,] 0.22 0.34 NA
# [3,] 0.34 0.41 NA
# [4,] 0.26 0.14 NA
#
# [[3]]
# [,1] [,2] [,3] [,4]
# [1,] NA NA 1.0 2.0
# [2,] NA NA 0.2 0.4
Data:
dir.create('foo1')
write.table(read.table(header=TRUE, text='
V1 V2 V3 V4 V5
41 42 43 45 46
0.31 0.21 0.87 0.65 0.54
0.32 0.28 0.74 0.87 0.65
0.19 0.12 0.99 0.99 0.89'), './foo1/file_1.txt', row.names=F)
write.table(read.table(header=TRUE, text='
V1 V2 V3 V4 V5
12 24 13 14 16
0.89 0.78 0.50 0.22 0.34
0.54 0.78 0.50 0.34 0.41
0.78 0.54 0.66 0.26 0.14'), './foo1/file_2.txt', row.names=F)
write.table(read.table(header=TRUE, text='
V1 V2 V3 V4 V5
1 2 3 5 6
0.20 0.40 0.50 0.49 0.52 '), './foo1/file_3.txt', row.names=F)

Find position of elements of a dataframe inside other dataframe with R

I have the following dataframe (DF_A):
PARTY_ID PROBS_3001 PROBS_3002 PROBS_3003 PROBS_3004 PROBS_3005 PROBS_3006 PROBS_3007 PROBS_3008
1: 1000000 0.03 0.58 0.01 0.42 0.69 0.98 0.55 0.96
2: 1000001 0.80 0.37 0.10 0.95 0.77 0.69 0.23 0.07
3: 1000002 0.25 0.73 0.79 0.83 0.24 0.82 0.81 0.01
4: 1000003 0.10 0.96 0.53 0.59 0.96 0.10 0.98 0.76
5: 1000004 0.36 0.87 0.76 0.03 0.95 0.40 0.53 0.89
6: 1000005 0.15 0.78 0.24 0.21 0.03 0.87 0.67 0.64
And I have this other dataframe (DF_B):
V1 V2 V3 V4 PARTY_ID
1 0.58 0.69 0.96 0.98 1000000
2 0.69 0.77 0.80 0.95 1000001
3 0.79 0.81 0.82 0.83 1000002
4 0.76 0.96 0.96 0.98 1000003
5 0.76 0.87 0.89 0.95 1000004
6 0.64 0.67 0.78 0.87 1000005
I need to find the position of the elements of the DF_A in the DF_B to have something like this:
PARTY_ID P1 P2 P3 P4
1 1000000 3 6 9 7
...
Currently I'm working with match function but it takes a lot of time (I have 400K rows). I'm doing this:
i <- 1
while(i < nrow(DF_A)){
position <- match(DF_B[i,],DF_A[i,])
i <- i + 1
}
Although it works, it's very slow and I know that it's not the best answer to my problem. Can anyone help me please??
You can merge and then Map with a by group operation:
df_a2 <- df_a[setDT(df_b), on = "PARTY_ID"]
df_a3 <- df_a2[, c(PARTY_ID,
Map(f = function(x,y) which(x==y),
x = list(.SD[,names(df_a), with = FALSE]),
y = .SD[, paste0("V",1:4), with = FALSE])), by = 1:nrow(df_a2)]
setnames(df_a3, paste0("V",1:5), c("PARTY_ID", paste0("P", 1:4)))[,nrow:=NULL]
df_a3
# PARTY_ID P1 P2 P3 P4
#1: 1000000 3 6 9 7
#2: 1000001 7 6 2 5
#3: 1000002 4 8 7 5
#4: 1000003 9 3 3 8
#5: 1000003 9 6 6 8
#6: 1000004 4 3 9 6
#7: 1000005 9 8 3 7
Here is an example on 1 milion rows with two columns. It takes 14 ms on my computer.
# create data tables with matching ids but on different positions
x <- as.data.table(data.frame(id=sample(c(1:1000000), 1000000, replace=FALSE), y=sample(LETTERS, 1000000, replace=TRUE)))
y <- as.data.table(data.frame(id=sample(c(1:1000000), 1000000, replace=FALSE), z=sample(LETTERS, 1000000, replace=TRUE)))
# add column to both data tables which will store the position in x and y
x$x_row_nr <- 1:nrow(x)
y$y_row_nr <- 1:nrow(y)
# set key in both data frames using matching columns name
setkey(x, "id")
setkey(y, "id")
# merge data tables into one
z <- merge(x,y)
# now you just use this to extract what is the position
# of 100 hundreth record in x data table in y data table
z[x_row_nr==100, y_row_nr]
z will contain matching row records from both datasets with there columns attached.

Sorting values in dataframe by order of values in another dataframe R

I would like to sort values in columns of the xy1 dataframe, based on the increasing order of values in columns of the xy dataframe.
x <- c(3,1,7,45,22,2)
y <- c(23,65,1,23,2,11)
xy <- data.frame(x,y)
x1 <- c(0.34,0.3,0.7,0.22,0.67,0.87)
y1 <- c(0.4,0.13,0.17,0.72,0.61,0.7)
xy1 <- data.frame(x1,y1)
> xy
x y
1 3 23
2 1 65
3 7 1
4 45 23
5 22 2
6 2 11
> xy1
x1 y1
1 0.34 0.40
2 0.30 0.13
3 0.70 0.17
4 0.22 0.72
5 0.67 0.61
6 0.87 0.70
The following is a new data.frame result that I desire - note it deals with repeated observations (two the same values in y). x1 and y1 are now sorted according to the order of values in each column of xy dataframe.
x1 y1
1 0.30 0.17
2 0.87 0.61
3 0.34 0.70
4 0.70 0.40
5 0.67 0.72
6 0.22 0.13
You can use the order function to get the sorting order of a vector.
x <- c(3,1,7,45,22,2)
y <- c(23,65,1,23,2,11)
xy <- data.frame(x,y)
x1 <- c(0.34,0.3,0.7,0.22,0.67,0.87)
y1 <- c(0.4,0.13,0.17,0.72,0.61,0.7)
xy1 <- data.frame(x1,y1)
result <- data.frame(x1[order(x)], y1[order(y)])
result
This produces
x1.order.x.. y1.order.y..
1 0.30 0.17
2 0.87 0.61
3 0.34 0.70
4 0.70 0.40
5 0.67 0.72
6 0.22 0.13
You can beautify the output by setting the column names in the result:
data.frame(x1=x1[order(x)], y1=y1[order(y)])
Now if you don't want to manually type in everything but have two data frames with the same dimensions that you can use this one-liner
sapply(1:ncol(xy1), function(i) {xy1[order(xy[,i]), i]})
which produces
[,1] [,2]
[1,] 0.30 0.17
[2,] 0.87 0.61
[3,] 0.34 0.70
[4,] 0.70 0.40
[5,] 0.67 0.72
[6,] 0.22 0.13
As this is based on ordering corresponding columns on both datasets, Map can be used
xy1[] <- Map(function(x,y) x[order(y)], xy1, xy)
xy1
# x1 y1
#1 0.30 0.17
#2 0.87 0.61
#3 0.34 0.70
#4 0.70 0.40
#5 0.67 0.72
#6 0.22 0.13
Or another option is to order based on the col of 'xy', 'xy'
xy1[] <- as.matrix(xy1)[order(col(xy), xy)]
xy1
# x1 y1
#1 0.30 0.17
#2 0.87 0.61
#3 0.34 0.70
#4 0.70 0.40
#5 0.67 0.72
#6 0.22 0.13
You could try this:
library(tidyverse)
df_1 <- xy %>%
bind_cols(xy1) %>%
arrange(x) %>%
select(x1)
df_2 <- xy %>%
bind_cols(xy1) %>%
arrange(y) %>%
select(y1)
df <- bind_cols(df_1, df_2)
Which returns:
# A tibble: 6 x 2
x1 y1
<dbl> <dbl>
1 0.30 0.17
2 0.87 0.61
3 0.34 0.70
4 0.70 0.40
5 0.67 0.72
6 0.22 0.13
Basically just arrange x1 and y1 by x and y separately, then combine x1 and y1.

Remove values based on specific relation to previous value in same column

I have daily stock return data for several companies and need to remove those values, which have a specific relation to the previous (= the day before) return value.
In a mathematical formula it looks something like that:(1+r)*(1+e)-1<= 50%, where r is the return on the current day and e the return on the previous day, and at least either r or e being greater than 100%.
The data frame DF looks like that.
Date A B C D
01.01.2015 0.15 0.17 0.70 0.65
02.01.2015 1.01 0.75 0.01 -0.18
01.02.2015 -0.50 0.64 1.20 0.1
06.02.2015 0.12 0.54 0.13 1.50
01.03.2016 0.45 0.54 1.89 0.56
Afte apllying this filter DF should look like this.
Date A B C D
01.01.2015 0.15 0.17 0.70 0.65
02.01.2015 1.01 0.75 0.01 -0.18
01.02.2015 NA 0.64 1.20 0.1
06.02.2015 0.12 0.54 0.13 1.50
01.03.2016 0.45 0.54 1.89 0.56
Thanks for your help!
I would try this:
library(tidyverse)
check_fn <- function(Z){
ifelse(((lag(Z, n=1) > 1 | Z >1) & ((1+lag(Z, n=1))*(1+Z) <= 1.5)), NA, Z)
}
Y <- X %>%
mutate_at(vars(2:5), check_fn)
Y[1, 2:5] <- X[1, 2:5]
Y
... which generates:
Date A B C D
1 01.01.2015 0.15 0.17 0.70 0.65
2 02.01.2015 1.01 0.75 0.01 -0.18
3 01.02.2015 NA 0.64 1.20 0.10
4 06.02.2015 0.12 0.54 0.13 1.50
5 01.03.2016 0.45 0.54 1.89 0.56
I hope it helps you.
There might for shure be a more elegang solution.
> M=matrix(c(0.15,0.17,0.70,0.65,1.01,0.75,0.01,-0.18,-0.50,0.64,1.20,0.1,0.12,0.54,0.13,1.50,0.45,0.54,1.89,0.56),nrow = 5, byrow = TRUE)
> M
[,1] [,2] [,3] [,4]
[1,] 0.15 0.17 0.70 0.65
[2,] 1.01 0.75 0.01 -0.18
[3,] -0.50 0.64 1.20 0.10
[4,] 0.12 0.54 0.13 1.50
[5,] 0.45 0.54 1.89 0.56
> ifelse(rbind(c(T,T,T,T), !(((M[2:5,]>1)|(M[1:4,]>1))&(((1+M[2:5,])*(1+M[1:4,])-1)<.5))), M, NA)
[,1] [,2] [,3] [,4]
[1,] 0.15 0.17 0.70 0.65
[2,] 1.01 0.75 0.01 -0.18
[3,] NA 0.64 1.20 0.10
[4,] 0.12 0.54 0.13 1.50
[5,] 0.45 0.54 1.89 0.56
Sorry, I misread some of your post. I have corrected it so it matches your expected output.
library(data.table)
setDT(dat)
dat = dat[ , lapply(.SD, relationship), .SDcols = c("A", "B", "C", "D")]
relationship = function(x){
return(ifelse(((1 + x)*(1 + shift(x)) - 1) < .5 & !is.na(shift(x)) & (x > 1 | shift(x) > 1), NA, x))
}
> dat[ , lapply(.SD, relationship), .SDcols = c("A", "B", "C", "D")]
A B C D
1: 0.15 0.17 0.70 0.65
2: 1.01 0.75 0.01 -0.18
3: NA 0.64 1.20 0.10
4: 0.12 0.54 0.13 1.50
5: 0.45 0.54 1.89 0.56
You can cbind the dates back onto the data.table
I should add incase there are many more columns that this needs to be done on and you don't want to write them all out you could do something like this.
Dates = dat$Date
dat[ , "Date" := NULL]
dat = dat[ , lapply(.SD, relationship)]
That will apply the function to every column in the data.table.

R :How to execute FOR loop for Kmeans

I have an input file with Format as below :
RN KEY MET1 MET2 MET3 MET4
1 1 0.11 0.41 0.91 0.17
2 1 0.94 0.02 0.17 0.84
3 1 0.56 0.64 0.46 0.7
4 1 0.57 0.23 0.81 0.09
5 2 0.82 0.67 0.39 0.63
6 2 0.99 0.90 0.34 0.84
7 2 0.83 0.01 0.70 0.29
I have to execute Kmeans in R -separately for DF with Key=1 and Key=2 and so on...
Afterwards the final output CSV should look like
RN KEY MET1 MET2 MET3 MET4 CLST
1 1 0.11 0.41 0.91 0.17 1
2 1 0.94 0.02 0.17 0.84 1
3 1 0.56 0.64 0.46 0.77 2
4 1 0.57 0.23 0.81 0.09 2
5 2 0.82 0.67 0.39 0.63 1
6 2 0.99 0.90 0.34 0.84 2
7 2 0.83 0.01 0.70 0.29 2
Ie Key=1 is to be treated as separate DF and Key=2 is be treated as separate DF and so on...
Finally the output of clustering (of each DF)is to be combined with Key column first (since Key cannot participate in clustering) and then combined with each different DF for final output
In the above example :
DF1 is
KEY MET1 MET2 MET3 MET4
1 0.11 0.41 0.91 0.17
1 0.94 0.02 0.17 0.84
1 0.56 0.64 0.46 0.77
1 0.57 0.23 0.81 0.09
DF2 is
KEY MET1 MET2 MET3 MET4
2 0.82 0.67 0.39 0.63
2 0.99 0.90 0.34 0.84
2 0.83 0.01 0.70 0.29
Please suggest how to achieve in R
Psuedo code :
n<-Length(unique(Mydf$key))
for i=1 to n
{
#fetch partial df for each value of Key and run K means
dummydf<-subset(mydf,mydf$key=i
KmeansIns<-Kmeans(dummydf,2)
# combine with cluster result
dummydf<-data.frame(dummydf,KmeansIns$cluster)
# combine each smalldf into final Global DF
finaldf<-data.frame(finaldf,dummydf)
}Next i
#Now we have finaldf then it can be written to file
I think the easiest way would be to use by. Something along the lines of
by(data = DF, INDICES = DF$KEY, FUN = function(x) {
# your clustering code here
})
where x is a subset of your DF for each KEY.
A solution using data.tables.
library(data.table)
setDT(DF)[,CLST:=kmeans(.SD, centers=2)$clust, by=KEY, .SDcols=3:6]
DF
# RN KEY MET1 MET2 MET3 MET4 CLST
# 1: 1 1 0.11 0.41 0.91 0.17 2
# 2: 2 1 0.94 0.02 0.17 0.84 1
# 3: 3 1 0.56 0.64 0.46 0.70 1
# 4: 4 1 0.57 0.23 0.81 0.09 2
# 5: 5 2 0.82 0.67 0.39 0.63 2
# 6: 6 2 0.99 0.90 0.34 0.84 2
# 7: 7 2 0.83 0.01 0.70 0.29 1
#Read data
mdf <- read.table("mydat.txt", header=T)
#Convert to list based on KEY column
mls <- split(mdf, f=mdf$KEY)
#Define columns to use in clustering
myv <- paste0("MET", 1:4)
#Cluster each df item in list : modify kmeans() args as appropriate
kls <- lapply(X=mls, FUN=function(x){x$clust <- kmeans(x[, myv],
centers=2)$cluster ; return(x)})
#Make final "global" df
finaldf <- do.call(rbind, kls)

Resources