Classification using custom ordination rule - r

In my example, I want to use the following code:
# Classification dataset
library(dplyr)
nest <- c(1, 3, 4, 7, 12, 13, 21, 25, 26, 28)
finder_max <- c(9, 50, 25, 50, 25, 50, 9, 9, 9, 3)
max_TA <- c(7.4, 29.4, 17.0, 33.1, 16.2, 34.4, 4.3, 3.52, 7.47, 1.4)
ds.class <- data.frame(nest, finder_max, max_TA)
# Map each finder_max value to its class label through a named lookup
# vector instead of a deeply nested ifelse() chain; values not in the
# lookup fall back to "Class_50_51" (same as the original else branch).
class_lookup <- c("3" = "Class_1_3", "9" = "Class_3_9",
                  "25" = "Class_9_25", "50" = "Class_25_50")
ds.class$ClassType <- unname(class_lookup[as.character(ds.class$finder_max)])
ds.class$ClassType[is.na(ds.class$ClassType)] <- "Class_50_51"
ds.class
# nest finder_max max_TA ClassType
# 1 1 9 7.40 Class_3_9
# 2 3 50 29.40 Class_25_50
# 3 4 25 17.00 Class_9_25
# 4 7 50 33.10 Class_25_50
# 5 12 25 16.20 Class_9_25
# 6 13 50 34.40 Class_25_50
# 7 21 9 4.30 Class_3_9
# 8 25 9 3.52 Class_3_9
# 9 26 9 7.47 Class_3_9
# 10 28 3 1.40 Class_1_3
# Custom ordination vector: class labels ordered from lowest to highest
custom.vec <- c("Class_0_1", "Class_1_3", "Class_3_9",
                "Class_9_25", "Class_25_50", "Class_50")
# Original dataset
my.ds <- read.csv("https://raw.githubusercontent.com/Leprechault/trash/main/test_ants.csv")
# Bin AT into the same ordered classes; reusing custom.vec as the label
# set keeps the factor levels and the ordination vector from drifting apart.
my.ds$ClassType <- cut(my.ds$AT,
                       breaks = c(-Inf, 1, 2.9, 8.9, 24.9, 49.9, Inf),
                       right = FALSE,
                       labels = custom.vec)
str(my.ds)
# 'data.frame': 55 obs. of 4 variables:
# $ days : int 0 47 76 0 47 76 118 160 193 227 ...
# $ nest : int 2 2 2 3 3 3 3 3 3 3 ...
# $ AT : num 10.92 22.86 23.24 0.14 0.48 ...
# $ ClassType: Factor w/ 6 levels "Class_0_1","Class_1_3",..: 4 4 4 1 1 1 1 1 1 1 ...
I'd like to remove the rows in my.ds whose ClassType equals the ClassType found in ds.class for the same nest. I also need to remove the classes
that rank higher in my custom ordination (custom.vec) than that ClassType. Example: if nest 3 has ClassType Class_25_50 in ds.class, I need to remove the data with this ClassType and any higher classes ("Class_50"), if they exist, for nest 3 in the file my.ds
My new output for new.my.ds must be:
new.my.ds
# days nest AT ClassType
# 1 0 2 10.9200 Class_9_25
# 2 47 2 22.8600 Class_9_25
# 3 76 2 23.2400 Class_9_25
# 4 0 3 0.1400 Class_0_1
# 5 47 3 0.4800 Class_0_1
# 6 76 3 0.8300 Class_0_1
# 7 118 3 0.8300 Class_0_1
# 8 160 3 0.9400 Class_0_1
# 9 193 3 0.9400 Class_0_1
# 10 227 3 0.9400 Class_0_1
# 11 262 3 0.9400 Class_0_1
# 12 306 3 0.9400 Class_0_1
# 13 355 3 11.9300 Class_9_25
# 14 396 3 12.8100 Class_9_25
# 16 0 4 1.0000 Class_1_3
# 17 76 4 1.5600 Class_1_3
# 18 160 4 2.8800 Class_1_3
# 19 193 4 2.8800 Class_1_3
# 20 227 4 2.8800 Class_1_3
# 21 262 4 2.8800 Class_1_3
# 22 306 4 2.8800 Class_1_3
# 24 0 7 11.7100 Class_9_25
# 25 47 7 24.7900 Class_9_25
#...
# 55 349 1067 0.9600 Class_0_1
Please, any help with it?

Related

R- Subtract values of all rows in a group from previous row in different group and filter out rows

In R say I had the data frame:
data
frame object x y
1 6 150 100
2 6 149 99
3 6 148 98
3 6 140 90
4 6 148.5 97
4 6 142 93
5 6 147 96
5 6 138 92
5 6 135 90
6 6 146.5 99
1 7 125 200
2 7 126 197
3 7 127 202
3 7 119 185
4 7 117 183
4 7 123 199
5 7 115 190
5 7 124 202
5 7 118 192
6 7 124.5 199
I want to output the object which is the closest in the previous frame based on the (x,y) coordinates and filter out the other objects. I want to find the difference in the x and y between all the objects in a given frame and the single object in the previous frame and keep the closest object while removing the rest. The object that is kept would then serve as reference for the next frame. The frames with only one object would be left as is. The output should be one object per frame:
data
frame object x y
1 6 150 100
2 6 149 99
3 6 148 98
4 6 148.5 97
5 6 147 96
6 6 146.5 99
1 7 125 200
2 7 126 197
3 7 127 202
4 7 123 199
5 7 124 202
6 7 124.5 199
This is a cumulative operation, so it'll take an iterative approach. Here's a simple function to do one operation, assuming it's for only one object.
fun <- function(Z, fr) {
  # For frame `fr`, keep only the row closest (L1 distance in x/y) to the
  # single row already retained for frame `fr - 1`; every other frame of
  # Z passes through unchanged.
  reference <- head(subset(Z, frame == (fr - 1)), 1)
  candidates <- subset(Z, frame == fr)
  # Nothing to disambiguate: no reference row, or frame already has one row.
  if (nrow(reference) < 1 || nrow(candidates) < 2) {
    return(Z)
  }
  l1_dist <- abs(candidates$x - reference$x) + abs(candidates$y - reference$y)
  best <- which.min(l1_dist)
  rbind(subset(Z, frame != fr), candidates[best, ])
}
fun(subset(dat, object == 6), 3)
# frame object x y
# 1 1 6 150.0 100
# 2 2 6 149.0 99
# 5 4 6 148.5 97
# 6 4 6 142.0 93
# 7 5 6 147.0 96
# 8 5 6 138.0 92
# 9 5 6 135.0 90
# 10 6 6 146.5 99
# 3 3 6 148.0 98
(The order is not maintained, it can easily be sorted back into place as needed.)
Now we can Reduce this for each object within the data.
# Apply `fun` cumulatively over frames 2..max within each object, then
# reassemble all objects and restore (object, frame) order.
out <- do.call(rbind,
  lapply(split(dat, dat$object), function(X) {
    frs <- range(X$frame)
    # Guard the single-frame case: seq(a, b) counts DOWN when a > b, so
    # the original seq(min+1, max) could iterate over nonexistent frames.
    if (frs[1] >= frs[2]) return(X)
    Reduce(fun, seq(frs[1] + 1, frs[2]), init = X)
  }))
out <- out[order(out$object, out$frame), ]
out
# frame object x y
# 6.1 1 6 150.0 100
# 6.2 2 6 149.0 99
# 6.3 3 6 148.0 98
# 6.5 4 6 148.5 97
# 6.7 5 6 147.0 96
# 6.10 6 6 146.5 99
# 7.11 1 7 125.0 200
# 7.12 2 7 126.0 197
# 7.13 3 7 127.0 202
# 7.16 4 7 123.0 199
# 7.18 5 7 124.0 202
# 7.20 6 7 124.5 199
We can create a for loop that applies the criteria to a single object, and then use group_by %>% summarize to apply it to every object:
library(dplyr)
keep_closest_frame = function(data) {
  # Split one object's rows by frame; frame 1 must already be unambiguous.
  # BUG FIX: the original called split(data, dd$frame), referencing an
  # undefined object `dd` instead of the `data` argument.
  frames = split(data, data$frame)
  for(i in seq_along(frames)) {
    # Scalar condition: use && (short-circuits) rather than elementwise &.
    if(nrow(frames[[i]]) != 1 && i == 1) {
      stop("First frame must have exactly 1 row")
    }
    if(nrow(frames[[i]]) == 1) next
    # L1 distance from each candidate row to the single row kept for the
    # previous frame; retain only the nearest candidate.
    dists = with(frames[[i]], abs(x - frames[[i - 1]][["x"]]) + abs(y - frames[[i - 1]][["y"]]))
    frames[[i]] = frames[[i]][which.min(dists), ]
  }
  bind_rows(frames)
}
# Apply the per-object filter to every object; across() hands the whole
# group's columns to keep_closest_frame() as a data frame.
data %>%
group_by(object) %>%
summarize(keep_closest_frame(across()))
# # A tibble: 12 × 4
# # Groups: object [2]
# object frame x y
# <int> <int> <dbl> <int>
# 1 6 1 150 100
# 2 6 2 149 99
# 3 6 3 148 98
# 4 6 4 148. 97
# 5 6 5 147 96
# 6 6 6 146. 99
# 7 7 1 125 200
# 8 7 2 126 197
# 9 7 3 127 202
# 10 7 4 123 199
# 11 7 5 124 202
# 12 7 6 124. 199

Slide along data frame rows and compare rows with next rows

I guess something similar has been asked before; however, I could only find answers for Python and SQL. So please let me know in the comments if this has also been asked for R!
Data
Let's say we have a dataframe like this:
set.seed(1); df <- data.frame( position = 1:20,value = sample(seq(1,100), 20))
# In cause you do not get the same dataframe see the comment by #Ian Campbell - thanks!
position value
1 1 27
2 2 37
3 3 57
4 4 89
5 5 20
6 6 86
7 7 97
8 8 62
9 9 58
10 10 6
11 11 19
12 12 16
13 13 61
14 14 34
15 15 67
16 16 43
17 17 88
18 18 83
19 19 32
20 20 63
Goal
I'm interested in calculating the average value for n positions and subtract this from the average value of the next n positions, let's say n=5 for now.
What I tried
I now used this method, however when I apply this to a bigger dataframe it takes a huge amount of time, and hence wonder if there is a faster method for this.
# Difference between the mean `value` of positions pos..pos+4 and the mean
# of the next five positions (pos+5..pos+9), returned as a one-row frame.
# NOTE(review): reads the global `df` rather than taking it as an argument,
# so it only works on the data frame named `df` in the calling environment.
calc <- function( pos ) {
this.five <- df %>% slice(pos:(pos+4))
next.five <- df %>% slice((pos+5):(pos+9))
differ = mean(this.five$value)- mean(next.five$value)
data.frame(dif= differ)
}
# Run calc() once per position (one group per row, so this is effectively
# a slow row-wise loop — hence the performance complaint above).
df %>%
group_by(position) %>%
do(calc(.$position))
That produces the following table:
position dif
<int> <dbl>
1 1 -15.8
2 2 9.40
3 3 37.6
4 4 38.8
5 5 37.4
6 6 22.4
7 7 4.20
8 8 -26.4
9 9 -31
10 10 -35.4
11 11 -22.4
12 12 -22.3
13 13 -0.733
14 14 15.5
15 15 -0.400
16 16 NaN
17 17 NaN
18 18 NaN
19 19 NaN
20 20 NaN
I suspect a data.table approach may be faster.
library(data.table)
setDT(df)  # convert df to a data.table in place
# Left-aligned rolling mean of width 5 for each column of .SD (position
# and value); only the `value` roll ("rollmean") is used below.
df[,c("roll.position","rollmean") := lapply(.SD,frollmean,n=5,fill=NA, align = "left")]
# This window's mean minus the window starting 5 rows later; .I is the
# row index, so rollmean[.I + 5] is NA near the end of the table.
df[, result := rollmean[.I] - rollmean[.I + 5]]
df[,.(position,value,rollmean,result)]
# position value rollmean result
# 1: 1 27 46.0 -15.8
# 2: 2 37 57.8 9.4
# 3: 3 57 69.8 37.6
# 4: 4 89 70.8 38.8
# 5: 5 20 64.6 37.4
# 6: 6 86 61.8 22.4
# 7: 7 97 48.4 4.2
# 8: 8 62 32.2 -26.4
# 9: 9 58 32.0 -31.0
#10: 10 6 27.2 -35.4
#11: 11 19 39.4 -22.4
#12: 12 16 44.2 NA
#13: 13 61 58.6 NA
#14: 14 34 63.0 NA
#15: 15 67 62.6 NA
#16: 16 43 61.8 NA
#17: 17 88 NA NA
#18: 18 83 NA NA
#19: 19 32 NA NA
#20: 20 63 NA NA
Data
# Reproduce the sample() draws shown above on R >= 3.6 by temporarily
# restoring the pre-3.6 "Rounding" sampling algorithm.
RNGkind(sample.kind = "Rounding")
set.seed(1); df <- data.frame( position = 1:20,value = sample(seq(1,100), 20))
RNGkind(sample.kind = "default")

R: Manipulating dataframes

Df_01a
Name re1 re2 re3 parameter
a 144 39.7 0.012 fed
b 223 31.2 5 fed
c 304 6.53 100 fed
d 187 51.3 25 fed
e 110 2.94 100 fed
f 151 4.23 75 fed
g 127 36.7 0.012 fed
Df_01b
Name re1 re2 re3 parameter
a 142 39.3 0.042 feh
b 221 31.0 4 feh
c 301 6.13 90 feh
d 185 41.3 15 feh
e 107 2.44 940 feh
f 143 2.23 75 feh
g 121 31.7 0.012 feh
Df_02
parameter c1 c2 c3
1 fed 5 4 3
2 feh 3 4 2
3 fea 5 4 3
4 few 2 4 3
Desired result:
c-value re-value name
5 142 a_fed
4 39.3 a_fed
3 0.042 a_fed
5 221 b_fed
4 31.0 b_fed
3 4 b_fed
5 304 c_fed
4 6.53 c_fed
3 100 c_fed
....
3 0.012 g_fed
3 142 a_feh
4 39.3 a_feh
2 0.042 a_feh
3 221 b_feh
4 31.0 b_feh
2 4 b_feh
....
I have Df_01a, Df_01b, Df_01c, Df_01d. These have a parameter in
column 5: fed, feh, fea, few, respectively (See Df_02).
Each parameter has 3 values, given by c1, c2 and c3 in Df_02.
How can I get the desired data.frame shown above?
code
library(dplyr)
library(tidyr)
# Stack both inputs, reshape the re1..re3 columns long, attach the
# matching c1..c3 weights from Df_02, and emit (c-value, re-value, name).
# gather() is superseded; pivot_longer() is its modern replacement, and
# the join keys are spelled out instead of relying on the implicit `by`.
rbind(Df_01a, Df_01b) %>%
  pivot_longer(c("re1", "re2", "re3"),
               names_to = "re-col", values_to = "re-value") %>%
  inner_join(Df_02 %>%
               rename(re1 = c1, re2 = c2, re3 = c3) %>%
               pivot_longer(c("re1", "re2", "re3"),
                            names_to = "re-col", values_to = "c-value"),
             by = c("parameter", "re-col")) %>%
  arrange(parameter, Name) %>%
  unite(name, Name, parameter) %>%
  select(`c-value`, `re-value`, `name`)
result
# c-value re-value name
# 1 5 144.000 a_fed
# 2 4 39.700 a_fed
# 3 3 0.012 a_fed
# 4 5 223.000 b_fed
# 5 4 31.200 b_fed
# 6 3 5.000 b_fed
# 7 5 304.000 c_fed
# 8 4 6.530 c_fed
# 9 3 100.000 c_fed
# 10 5 187.000 d_fed
# 11 4 51.300 d_fed
# 12 3 25.000 d_fed
# 13 5 110.000 e_fed
# 14 4 2.940 e_fed
# 15 3 100.000 e_fed
# 16 5 151.000 f_fed
# 17 4 4.230 f_fed
# 18 3 75.000 f_fed
# 19 5 127.000 g_fed
# 20 4 36.700 g_fed
# 21 3 0.012 g_fed
# 22 3 142.000 a_feh
# 23 4 39.300 a_feh
# 24 2 0.042 a_feh
# 25 3 221.000 b_feh
# 26 4 31.000 b_feh
# 27 2 4.000 b_feh
# 28 3 301.000 c_feh
# 29 4 6.130 c_feh
# 30 2 90.000 c_feh
# 31 3 185.000 d_feh
# 32 4 41.300 d_feh
# 33 2 15.000 d_feh
# 34 3 107.000 e_feh
# 35 4 2.440 e_feh
# 36 2 940.000 e_feh
# 37 3 143.000 f_feh
# 38 4 2.230 f_feh
# 39 2 75.000 f_feh
# 40 3 121.000 g_feh
# 41 4 31.700 g_feh
# 42 2 0.012 g_feh
data
# Example inputs reconstructed from the question (parameter "fed").
Df_01a <- read.table(text="Name re1 re2 re3 parameter
a 144 39.7 0.012 fed
b 223 31.2 5 fed
c 304 6.53 100 fed
d 187 51.3 25 fed
e 110 2.94 100 fed
f 151 4.23 75 fed
g 127 36.7 0.012 fed",header=T,stringsAsFactors=F)
# Second input table (parameter "feh").
Df_01b <- read.table(text="Name re1 re2 re3 parameter
a 142 39.3 0.042 feh
b 221 31.0 4 feh
c 301 6.13 90 feh
d 185 41.3 15 feh
e 107 2.44 940 feh
f 143 2.23 75 feh
g 121 31.7 0.012 feh",header=T,stringsAsFactors=F)
# Lookup of the c1..c3 weights for each parameter.
Df_02 <- read.table(text="parameter c1 c2 c3
1 fed 5 4 3
2 feh 3 4 2
3 fea 5 4 3
4 few 2 4 3",header=T,stringsAsFactors=F)

How to flatten out nested list into one list more efficiently instead of using unlist method?

I have a nested list which contains a set of data.frame objects, and now I want to flatten it out. I used the most common approach, the unlist method, but it did not properly flatten my list and the output was not well represented. How can I do this more efficiently? Does anyone know a trick for doing this operation? Thanks.
example:
mylist <- list(pass=list(Alpha.df1_yes=airquality[2:4,], Alpha.df2_yes=airquality[3:6,],Alpha.df3_yes=airquality[2:5,],Alpha.df4_yes=airquality[7:9,]),
fail=list(Alpha.df1_no=airquality[5:7,], Alpha.df2_no=airquality[8:10,], Alpha.df3_no=airquality[13:16,],Alpha.df4_no=airquality[11:13,]))
I tried like this, it works but output was not properly arranged.
res <- lapply(mylist, unlist)
after flatten out, I would like to do merge them without duplication:
out <- lapply(res, rbind.data.frame)
my desired output:
mylist[[1]]$pass:
Ozone Solar.R Wind Temp Month Day
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
How can make this sort of flatten output more compatibly represented? Can anyone propose possible idea of doing this in R? Thanks a lot.
Using lapply and duplicated:
# For each top-level element, stack its data frames and drop duplicate rows.
res <- lapply(mylist, function(i){
# Row-bind all data frames in this element.
x <- do.call(rbind, i)
# BUG FIX: the de-duplicated frame must be reassigned. The original
# evaluated x[ !duplicated(x), ] without assignment and discarded the
# result, so duplicates survived (visible in the posted output).
x <- x[ !duplicated(x), ]
rownames(x) <- NULL
x
})
res$pass
# Ozone Solar.R Wind Temp Month Day
# 1 36 118 8.0 72 5 2
# 2 12 149 12.6 74 5 3
# 3 18 313 11.5 62 5 4
# 4 12 149 12.6 74 5 3
# 5 18 313 11.5 62 5 4
# 6 NA NA 14.3 56 5 5
# 7 28 NA 14.9 66 5 6
# 8 36 118 8.0 72 5 2
# 9 12 149 12.6 74 5 3
# 10 18 313 11.5 62 5 4
# 11 NA NA 14.3 56 5 5
# 12 23 299 8.6 65 5 7
# 13 19 99 13.8 59 5 8
# 14 8 19 20.1 61 5 9
Above still returns a list, if we want to keep all in one dataframe with no lists, then:
# Flatten one nesting level (keeping the inner data frames intact),
# row-bind everything into a single frame, then keep only the first
# occurrence of each row.
flat <- unlist(mylist, recursive = FALSE)
res <- do.call(rbind, flat)
res <- subset(res, !duplicated(res))
res
# Ozone Solar.R Wind Temp Month Day
# pass.Alpha.df1_yes.2 36 118 8.0 72 5 2
# pass.Alpha.df1_yes.3 12 149 12.6 74 5 3
# pass.Alpha.df1_yes.4 18 313 11.5 62 5 4
# pass.Alpha.df2_yes.5 NA NA 14.3 56 5 5
# pass.Alpha.df2_yes.6 28 NA 14.9 66 5 6
# pass.Alpha.df4_yes.7 23 299 8.6 65 5 7
# pass.Alpha.df4_yes.8 19 99 13.8 59 5 8
# pass.Alpha.df4_yes.9 8 19 20.1 61 5 9
# fail.Alpha.df2_no.10 NA 194 8.6 69 5 10
# fail.Alpha.df3_no.13 11 290 9.2 66 5 13
# fail.Alpha.df3_no.14 14 274 10.9 68 5 14
# fail.Alpha.df3_no.15 18 65 13.2 58 5 15
# fail.Alpha.df3_no.16 14 334 11.5 64 5 16
# fail.Alpha.df4_no.11 7 NA 6.9 74 5 11
# fail.Alpha.df4_no.12 16 256 9.7 69 5 12

Shift up rows in R

This is a simple example of how my data looks like.
Suppose I got the following data
>x
Year a b c
1962 1 2 3
1963 4 5 6
. . . .
. . . .
2001 7 8 9
I need to form a time series of x with 7 column contains the following variables:
Year a lag(a) b lag(b) c lag(c)
What I did is the following:
> x<-ts(x) # converting x to a time series
> x<-cbind(x,x[,-1]) # adding the same variables to the time series without repeating the year column
> x
Year a b c a b c
1962 1 2 3 1 2 3
1963 4 5 6 4 5 6
. . . . . . .
. . . . . . .
2001 7 8 9 7 8 9
I need to shift the last three column up so they give the lags of a,b,c. then I will rearrange them.
Here's an approach using dplyr
df <- data.frame(
a=1:10,
b=21:30,
c=31:40)
library(dplyr)
# mutate_each()/funs() are long deprecated; across() is the modern
# replacement. lead(.x, 1) shifts every column up one row (NA at the
# end), and cbind(df, .) keeps the originals alongside the shifted copies.
df %>% mutate(across(everything(), ~ lead(.x, 1))) %>% cbind(df, .)
# a b c a b c
#1 1 21 31 2 22 32
#2 2 22 32 3 23 33
#3 3 23 33 4 24 34
#4 4 24 34 5 25 35
#5 5 25 35 6 26 36
#6 6 26 36 7 27 37
#7 7 27 37 8 28 38
#8 8 28 38 9 29 39
#9 9 29 39 10 30 40
#10 10 30 40 NA NA NA
You can change the names afterwards using colnames(df) <- c("a", "b", ...)
As #nrussel noted in his answer, what you described is a leading variable. If you want a lagging variable, you can change the lead in my answer to lag.
# Toy data: laga/lagb/lagc start as copies of a/b/c and are shifted below.
X <- data.frame(
a=1:100,
b=2*(1:100),
c=3*(1:100),
laga=1:100,
lagb=2*(1:100),
lagc=3*(1:100),
stringsAsFactors=FALSE)
##
Xts <- ts(X)
# Shift columns 4:6 up one row (leading values): row i takes row i+1's
# values, and the final row is filled with NA.
Xts[1:(nrow(Xts)-1),c(4,5,6)] <- Xts[2:nrow(Xts),c(4,5,6)]
Xts[nrow(Xts),c(4,5,6)] <- c(NA,NA,NA)
> head(Xts)
a b c laga lagb lagc
[1,] 1 2 3 2 4 6
[2,] 2 4 6 3 6 9
[3,] 3 6 9 4 8 12
[4,] 4 8 12 5 10 15
[5,] 5 10 15 6 12 18
[6,] 6 12 18 7 14 21
##
> tail(Xts)
a b c laga lagb lagc
[95,] 95 190 285 96 192 288
[96,] 96 192 288 97 194 291
[97,] 97 194 291 98 196 294
[98,] 98 196 294 99 198 297
[99,] 99 198 297 100 200 300
[100,] 100 200 300 NA NA NA
I'm not sure if by shift up you literally mean shift the rows up 1 place like above (because that would mean you are using lagging values not leading values), but here's the other direction ("true" lagged values):
# Same toy data as before; this time the lag columns are shifted down.
X2 <- data.frame(
a=1:100,
b=2*(1:100),
c=3*(1:100),
laga=1:100,
lagb=2*(1:100),
lagc=3*(1:100),
stringsAsFactors=FALSE)
##
Xts2 <- ts(X2)
# Shift columns 4:6 down one row ("true" lagged values): row i takes row
# i-1's values, and the first row is filled with NA.
Xts2[2:nrow(Xts2),c(4,5,6)] <- Xts2[1:(nrow(Xts2)-1),c(4,5,6)]
Xts2[1,c(4,5,6)] <- c(NA,NA,NA)
##
> head(Xts2)
a b c laga lagb lagc
[1,] 1 2 3 NA NA NA
[2,] 2 4 6 1 2 3
[3,] 3 6 9 2 4 6
[4,] 4 8 12 3 6 9
[5,] 5 10 15 4 8 12
[6,] 6 12 18 5 10 15
##
> tail(Xts2)
a b c laga lagb lagc
[95,] 95 190 285 94 188 282
[96,] 96 192 288 95 190 285
[97,] 97 194 291 96 192 288
[98,] 98 196 294 97 194 291
[99,] 99 198 297 98 196 294
[100,] 100 200 300 99 198 297

Resources