Related
I have a dataframe that has multiple columns (close to 100) I don't need that have "CNT" in the middle. Below is a short example:
id drink drink_CNT_v2 sage_CNT_v5
1 12 23 12
2 14 32 13
3 15 12 12
4 16 12 43
5 20 50 23
I want to remove all variables that have CNT in the middle. Does anyone know how I could do that. I tried using mutate in tidyverse, but that didn't work.
We could use contains in select
library(dplyr)
df2 <- df1 %>%
select(-contains("_CNT_"))
-output
df2
id drink
1 1 12
2 2 14
3 3 15
4 4 16
5 5 20
data
df1 <- structure(list(id = 1:5, drink = c(12L, 14L, 15L, 16L, 20L),
drink_CNT_v2 = c(23L, 32L, 12L, 12L, 50L), sage_CNT_v5 = c(12L,
13L, 12L, 43L, 23L)), class = "data.frame", row.names = c(NA,
-5L))
In base R, with grepl:
df[!grepl("CNT", colnames(df))]
Also works with select (use grep):
df %>%
select(-grep("CNT", names(.)))
I have a table with two columns A and B. I want to create a new table with two new columns added: X and Y. These two new columns are to contain data from column A, but every second row from column A. Correspondingly for column X, starting from the first value in column A and from the second value in column A for column Y.
So far, I have been doing it in Excel. But now I need it in R best function form so that I can easily reuse that code. I haven't done this in R yet, so I am asking for help.
Example data:
structure(list(A = c(2L, 7L, 5L, 11L, 54L, 12L, 34L, 14L, 10L,
6L), B = c(3L, 5L, 1L, 21L, 67L, 32L, 19L, 24L, 44L, 37L)), class = "data.frame", row.names = c(NA,
-10L))
Sample result:
structure(list(A = c(2L, 7L, 5L, 11L, 54L, 12L, 34L, 14L, 10L,
6L), B = c(3L, 5L, 1L, 21L, 67L, 32L, 19L, 24L, 44L, 37L), X = c(2L,
NA, 5L, NA, 54L, NA, 34L, NA, 10L, NA), Y = c(NA, 7L, NA, 11L,
NA, 12L, NA, 14L, NA, 6L)), class = "data.frame", row.names = c(NA,
-10L))
It is not a super elegant solution, but it works:
exampleDF <- structure(list(A = c(2L, 7L, 5L, 11L, 54L,
12L, 34L, 14L, 10L, 6L),
B = c(3L, 5L, 1L, 21L, 67L,
32L, 19L, 24L, 44L, 37L)),
class = "data.frame", row.names = c(NA, -10L))
index <- seq(from = 1, to = nrow(exampleDF), by = 2)
exampleDF$X <- NA
exampleDF$X[index] <- exampleDF$A[index]
exampleDF$Y <- exampleDF$A
exampleDF$Y[index] <- NA
You could also make use of the row numbers and the modulo operator:
A simple ifelse way:
library(dplyr)
df |>
mutate(X = ifelse(row_number() %% 2 == 1, A, NA),
Y = ifelse(row_number() %% 2 == 0, A, NA))
Or using pivoting:
library(dplyr)
library(tidyr)
df |>
mutate(name = ifelse(row_number() %% 2 == 1, "X", "Y"),
value = A) |>
pivot_wider()
A function using the first approach could look like:
See comment
xy_fun <- function(data, A = A, X = X, Y = Y) {
data |>
mutate({{X}} := ifelse(row_number() %% 2 == 1, {{A}}, NA),
{{Y}} := ifelse(row_number() %% 2 == 0, {{A}}, NA))
}
xy_fun(df, # Your data
A, # The col to take values from
X, # The column name of the first new column
Y # The column name of the second new column
)
Output:
A B X Y
1 2 3 2 NA
2 7 5 NA 7
3 5 1 5 NA
4 11 21 NA 11
5 54 67 54 NA
6 12 32 NA 12
7 34 19 34 NA
8 14 24 NA 14
9 10 44 10 NA
10 6 37 NA 6
Data stored as df:
df <- structure(list(A = c(2L, 7L, 5L, 11L, 54L, 12L, 34L, 14L, 10L, 6L),
B = c(3L, 5L, 1L, 21L, 67L, 32L, 19L, 24L, 44L, 37L)
),
class = "data.frame",
row.names = c(NA, -10L)
)
I like the #harre approach:
Another approach with base R we could ->
Use R's recycling ability (of a shorter-vector to a longer-vector):
df$X <- df$A
df$Y <- df$B
df$X[c(FALSE, TRUE)] <- NA
df$Y[c(TRUE, FALSE)] <- NA
df
A B X Y
1 2 3 2 NA
2 7 5 NA 5
3 5 1 5 NA
4 11 21 NA 21
5 54 67 54 NA
6 12 32 NA 32
7 34 19 34 NA
8 14 24 NA 24
9 10 44 10 NA
10 6 37 NA 37
I have following data that represents sequence of person's choice between four values (f1,f2,c1,c2) :
df=structure(list(combi = structure(c(24L, 8L, 3L, 19L, 4L, 23L,
15L, 12L, 14L, 22L, 5L, 13L, 18L, 9L, 2L, 25L, 11L, 7L, 21L,
10L, 6L, 17L, 20L, 16L), .Label = c("", "c1-c2-f1-f2", "c1-c2-f2-f1",
"c1-f1-c2-f2", "c1-f1-f2-c2", "c1-f2-c2-f1", "c1-f2-f1-c2", "c2-c1-f1-f2",
"c2-c1-f2-f1", "c2-f1-c1-f2", "c2-f1-f2-c1", "c2-f2-c1-f1", "c2-f2-f1-c1",
"f1-c1-c2-f2", "f1-c1-f2-c2", "f1-c2-c1-f2", "f1-c2-f2-c1", "f1-f2-c1-c2",
"f1-f2-c2-c1", "f2-c1-c2-f1", "f2-c1-f1-c2", "f2-c2-c1-f1", "f2-c2-f1-c1",
"f2-f1-c1-c2", "f2-f1-c2-c1"), class = "factor"), nb = c(10L,
0L, 2L, 4L, 1L, 5L, 1L, 2L, 1L, 3L, 1L, 0L, 3L, 5L, 0L, 18L,
5L, 2L, 5L, 0L, 4L, 4L, 11L, 2L)), .Names = c("combi", "nb"), class = "data.frame", row.names = c(1L,
3L, 5L, 7L, 9L, 11L, 13L, 15L, 17L, 19L, 21L, 23L, 25L, 27L,
29L, 31L, 33L, 35L, 37L, 39L, 41L, 43L, 45L, 47L))
I'm wondering if there's tree representation (or else) that could quantifiy, for each step choices number, by taking in account sub chain that are commun. Example :
f2 (52) -f1 (28) -c1-c2 (10)
-c2-c1 (18)
f2(52) there is 52 times chains begining by f2. there is 28 times chain beginning by f2-f1.
Thanks a lot.
If you read the combi values in (using as.character) you can expand those values to character columns:
df2 <- cbind(df, read.table(text=as.character(df$combi), sep="-",stringsAsFactors=FALSE) )
Then you can tabulate at whatever level you want:
xtabs(nb~V1, data=df2) # First level only
#V1
#c1 c2 f1 f2
#10 12 15 52
xtabs(nb~paste(V1,V2,sep="-"), data=df2) # first and second
#--
# paste(V1, V2, sep = "-")
#c1-c2 c1-f1 c1-f2 c2-c1 c2-f1 c2-f2 f1-c1 f1-c2 f1-f2 f2-c1 f2-c2 f2-f1
# 2 2 6 5 5 2 2 6 7 16 8 28
You can also deploy the addmargins function to compactly the display the two "most senior" position sub-totals:
addmargins( xtabs(nb~V1+V2, data=df2))
#=========
V2
V1 c1 c2 f1 f2 Sum
c1 0 2 2 6 10
c2 5 0 5 2 12
f1 2 6 0 7 15
f2 16 8 28 0 52
Sum 23 16 35 15 89
This could be "flattened" with ftable:
ftable( addmargins( xtabs(nb~V1+V2, data=df2)), row.vars=1:2)
V1 V2
c1 c1 0
c2 2
f1 2
f2 6
Sum 10
c2 c1 5
c2 0
f1 5
f2 2
Sum 12
f1 c1 2
c2 6
f1 0
f2 7
Sum 15
f2 c1 16
c2 8
f1 28
f2 0
Sum 52
Sum c1 23
c2 16
f1 35
f2 15
Sum 89
And the final tally would be:
xtabs(nb~paste(V1,V2,V3,V4,sep="-"), data=df2)
#-----
paste(V1, V2, V3, V4, sep = "-")
c1-c2-f1-f2 c1-c2-f2-f1 c1-f1-c2-f2 c1-f1-f2-c2 c1-f2-c2-f1 c1-f2-f1-c2 c2-c1-f1-f2 c2-c1-f2-f1
0 2 1 1 4 2 0 5
c2-f1-c1-f2 c2-f1-f2-c1 c2-f2-c1-f1 c2-f2-f1-c1 f1-c1-c2-f2 f1-c1-f2-c2 f1-c2-c1-f2 f1-c2-f2-c1
0 5 2 0 1 1 2 4
f1-f2-c1-c2 f1-f2-c2-c1 f2-c1-c2-f1 f2-c1-f1-c2 f2-c2-c1-f1 f2-c2-f1-c1 f2-f1-c1-c2 f2-f1-c2-c1
3 4 11 5 3 5 10 18
To see it all in a column:
as.matrix( xtabs(nb~paste(V1,V2,V3,V4,sep="-"), data=df2) )
#----------------
[,1]
c1-c2-f1-f2 0
c1-c2-f2-f1 2
c1-f1-c2-f2 1
c1-f1-f2-c2 1
c1-f2-c2-f1 4
c1-f2-f1-c2 2
c2-c1-f1-f2 0
c2-c1-f2-f1 5
c2-f1-c1-f2 0
c2-f1-f2-c1 5
c2-f2-c1-f1 2
c2-f2-f1-c1 0
f1-c1-c2-f2 1
f1-c1-f2-c2 1
f1-c2-c1-f2 2
f1-c2-f2-c1 4
f1-f2-c1-c2 3
f1-f2-c2-c1 4
f2-c1-c2-f1 11
f2-c1-f1-c2 5
f2-c2-c1-f1 3
f2-c2-f1-c1 5
f2-f1-c1-c2 10
f2-f1-c2-c1 18
I suppose a "final answer with all the subtotals might be:
ftable( addmargins( xtabs(nb~V1+V2+paste(V3,V4,sep="-"), data=df2)), row.vars=1:3)
However, that has so many zero entries that I hesitate to recommend. You could strip out zero rows:
my.ftable <- ftable( addmargins( xtabs(nb~V1+V2+paste(V3,V4,sep="-"), data=df2)), row.vars=1:3)
my.df.table <- as.data.frame(my.ftable)
names(my.df.table)[3] <- "3rd_4th"
my.df.table[ my.df.table$Freq > 0, ]
#---------
V1 V2 3rd_4th Freq
14 f2 f1 c1-c2 10
15 Sum f1 c1-c2 10
18 f1 f2 c1-c2 3
20 Sum f2 c1-c2 3
23 f1 Sum c1-c2 3
24 f2 Sum c1-c2 10
25 Sum Sum c1-c2 13
34 f2 c2 c1-f1 3
35 Sum c2 c1-f1 3
42 c2 f2 c1-f1 2
45 Sum f2 c1-f1 2
47 c2 Sum c1-f1 2
49 f2 Sum c1-f1 3
50 Sum Sum c1-f1 5
# and many more rows
#... until
321 c1 Sum Sum 10
322 c2 Sum Sum 12
323 f1 Sum Sum 15
324 f2 Sum Sum 52
325 Sum Sum Sum 89
The data.tree package specialises in tree representation. It is based on splitting variables in a hierarchal order, for example world -> continent -> country -> city. In your case, you've mentioned every order for c1, c2, f1 and f2. Likely you'd need to do four tree plots e.g. c1 --> either c2, f1 or f2, each leading to the two unused values, and then plot them.
A basic example starting with c1, and then splitting off, and not including specific values:
library(data.tree)
c1 <- Node$new("c1") # 1st level chain, "c1"
c2 <- c1$AddChild("c2") # new 2nd level chain, "c2", off c1
f1 <- c2$AddChild("f1-f2") # new level off c2
f2 <- c2$AddChild("f2-f1") # new level off c2
f1 <- c1$AddChild("f1") # new 2nd level chain, "f1", off c1
c2 <- f1$AddChild("c2-f2") # new level off f1
f2 <- f1$AddChild("f2-c2") # new level off f1
f2 <- c1$AddChild("f2") # new 2nd level chain, "f2", off c1
c2 <- f2$AddChild("c2-f1") # new level off f2
f1 <- f2$AddChild("f1-c2") # new level off f2
print(c1)
levelName
1 c1
2 ¦--c2
3 ¦ ¦--f1-f2
4 ¦ °--f2-f1
5 ¦--f1
6 ¦ ¦--c2-f2
7 ¦ °--f2-c2
8 °--f2
9 ¦--c2-f1
10 °--f1-c2
plot(c1)
Maybe not exactly what you mean by a "tree structure" but this gives you the numbers
in a table using base R. It should be easy to format that as you like from this result.
df=structure(list(combi = structure(c(24L, 8L, 3L, 19L, 4L, 23L,
15L, 12L, 14L, 22L, 5L, 13L, 18L, 9L, 2L, 25L, 11L, 7L, 21L,
10L, 6L, 17L, 20L, 16L), .Label = c("", "c1-c2-f1-f2", "c1-c2-f2-f1",
"c1-f1-c2-f2", "c1-f1-f2-c2", "c1-f2-c2-f1", "c1-f2-f1-c2", "c2-c1-f1-f2",
"c2-c1-f2-f1", "c2-f1-c1-f2", "c2-f1-f2-c1", "c2-f2-c1-f1", "c2-f2-f1-c1",
"f1-c1-c2-f2", "f1-c1-f2-c2", "f1-c2-c1-f2", "f1-c2-f2-c1", "f1-f2-c1-c2",
"f1-f2-c2-c1", "f2-c1-c2-f1", "f2-c1-f1-c2", "f2-c2-c1-f1", "f2-c2-f1-c1",
"f2-f1-c1-c2", "f2-f1-c2-c1"), class = "factor"), nb = c(10L,
0L, 2L, 4L, 1L, 5L, 1L, 2L, 1L, 3L, 1L, 0L, 3L, 5L, 0L, 18L,
5L, 2L, 5L, 0L, 4L, 4L, 11L, 2L)), .Names = c("combi", "nb"), class = "data.frame", row.names = c(1L,
3L, 5L, 7L, 9L, 11L, 13L, 15L, 17L, 19L, 21L, 23L, 25L, 27L,
29L, 31L, 33L, 35L, 37L, 39L, 41L, 43L, 45L, 47L))
tmp <- sapply(as.character(df$combi), strsplit, split = "-")
tmp <- do.call(rbind, tmp)
colnames(tmp) <- paste0("str", 1:4)
rownames(tmp) <- NULL
tmp <- data.frame(df, tmp)
tmp$str3 <- paste(tmp$str3, tmp$str4, sep = "-")
str1 <- aggregate(list(nb_str1 = tmp[,"nb"]), tmp["str1"], sum)
str2 <- aggregate(list(nb_str2 = tmp[,"nb"]), tmp[c("str1", "str2")], sum)
str3 <- aggregate(list(nb_str3 = tmp[,"nb"]), tmp[c("str1", "str2", "str3")], sum)
tmp <- merge(str3, str1)
tmp <- merge(tmp, str2)
tmp <- tmp[, c("str1", "nb_str1", "str2", "nb_str2", "str3", "nb_str3")]
tmp
#> str1 nb_str1 str2 nb_str2 str3 nb_str3
#> 1 c1 10 c2 2 f1-f2 0
#> 2 c1 10 c2 2 f2-f1 2
#> 3 c1 10 f1 2 c2-f2 1
#> 4 c1 10 f1 2 f2-c2 1
#> 5 c1 10 f2 6 c2-f1 4
#> 6 c1 10 f2 6 f1-c2 2
#> 7 c2 12 c1 5 f1-f2 0
#> 8 c2 12 c1 5 f2-f1 5
#> 9 c2 12 f1 5 c1-f2 0
#> 10 c2 12 f1 5 f2-c1 5
#> 11 c2 12 f2 2 c1-f1 2
#> 12 c2 12 f2 2 f1-c1 0
#> 13 f1 15 c1 2 c2-f2 1
#> 14 f1 15 c1 2 f2-c2 1
#> 15 f1 15 c2 6 c1-f2 2
#> 16 f1 15 c2 6 f2-c1 4
#> 17 f1 15 f2 7 c1-c2 3
#> 18 f1 15 f2 7 c2-c1 4
#> 19 f2 52 c1 16 c2-f1 11
#> 20 f2 52 c1 16 f1-c2 5
#> 21 f2 52 c2 8 c1-f1 3
#> 22 f2 52 c2 8 f1-c1 5
#> 23 f2 52 f1 28 c1-c2 10
#> 24 f2 52 f1 28 c2-c1 18
Created on 2018-03-15 by the reprex package (v0.2.0).
I am working on a dataframe and trying to find the index of nth maximum value (n varies by a loop), however, in the columns I have tied values and the program throws an error. Below is a sample dataset. I am basically trying to generate a similar dataframe, but with only the index values of all the values in the column vector of the dataframe.
For the output DF, column 1 in the output DF will have index values of elements of Refer_1, so Output_DF[1,1] will have the index for highest value, while Output_DF[10,1] will have the index of lowest value. Below is the input DF.
Input
1 17
2 21
3 13
4 26
5 204
6 36
7 14
8 25
9 45
10 37
Output (index values)
5
9
10
6
4
8
2
1
7
3
I am currently using which, unlist and partial together to get the indexes, however, I am unable to rectify the error. Note that the ties can occur with any nth maximum value (not necessarily the column maxima).
which(Consolidated_data_new[,i]==unlist(sort(Consolidated_data_new[,i],partial=j)[j]))
Please note that I want the code to return only one value at a time, and handle the 2nd tied value in the next loop iteration.
Please help solve this.
Regards,
library(data.table)
DT<-structure(list(Refer_1 = c(11L, 15L, 7L, 19L, 104L, 24L, 11L,
22L, 39L, 19L), Refer_2 = c(17L, 21L, 13L, 25L, 204L, 36L, 14L,
25L, 45L, 37L)), .Names = c("Refer_1", "Refer_2"), row.names = c(NA,
-10L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x0000000000130788>)
DT[,lapply(.SD, order,decreasing=TRUE)]
Refer_1 Refer_2
1: 5 5
2: 9 9
3: 6 10
4: 8 6
5: 4 4
6: 10 8
7: 2 2
8: 1 1
9: 7 7
10: 3 3
Your comments suggest you are working with a dataframe that has more than one column and that you want an output dataframe that has the results of order with decreasing=TRUE applied to every column:
> DF[2] <- sample(1:300, 10)
> DF[3] <- sample(1:300, 10)
> DF
Input V2 V3
1 17 210 3
2 21 72 4
3 13 263 1
4 26 249 6
5 204 223 10
6 36 83 7
7 14 107 2
8 25 295 5
9 45 198 9
10 37 112 8
> ordDF <- as.data.frame(lapply(DF, order, decreasing=TRUE))
> names(ordDF) <- paste0("res", 1:length(DF) )
> ordDF
res1 res2 res3
1 5 8 4
2 9 3 9
3 10 4 2
4 6 5 7
5 4 1 10
6 8 9 8
7 2 10 1
8 1 7 6
9 7 6 3
10 3 2 5
> dput(ordDF)
structure(list(res1 = c(5L, 9L, 10L, 6L, 4L, 8L, 2L, 1L, 7L,
3L), res2 = c(8L, 3L, 4L, 5L, 1L, 9L, 10L, 7L, 6L, 2L), res3 = c(4L,
9L, 2L, 7L, 10L, 8L, 1L, 6L, 3L, 5L)), .Names = c("res1", "res2",
"res3"), row.names = c(NA, -10L), class = "data.frame")
Using the example dataframe:
count.bouts <-structure(list(time.stamp = structure(c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L,
20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 28L, 29L, 30L, 31L,
32L, 33L, 34L, 35L, 36L, 37L), .Label = c("13:00:00", "13:00:10",
"13:00:20", "13:00:30", "13:00:40", "13:00:50", "13:01:00", "13:01:10",
"13:01:20", "13:01:30", "13:01:40", "13:01:50", "13:02:00", "13:02:10",
"13:02:20", "13:02:30", "13:02:40", "13:02:50", "13:03:00", "13:03:10",
"13:03:20", "13:03:30", "13:03:40", "13:03:50", "13:04:00", "13:04:10",
"13:04:20", "13:04:30", "13:04:40", "13:04:50", "13:05:00", "13:05:10",
"13:05:20", "13:05:30", "13:05:40", "13:05:50", "13:06:00"), class = "factor"),
count = c(5L, 11L, 16L, 19L, 15L, 11L, 8L, 5L, 2L, 6L, 12L,
15L, 20L, 12L, 6L, 2L, 18L, 25L, 26L, 15L, 13L, 6L, 5L, 4L,
8L, 9L, 16L, 26L, 29L, 55L, 21L, 6L, 9L, 28L, 16L, 19L, 26L,
5L)), .Names = c("time.stamp", "count"), class = "data.frame", row.names = c(NA,
-38L))
I wish to create a function that would identify bouts of high count activity that fulfils the following criteria:
Count data that is greater or equal to 10 for 1 minute or more
Within this period (or bout) of high counts, I would allow count data to drop to under 10 for a maximum of 20 seconds (within the bout)
Data that fulfils this criteria I would wish to be highlighted in the dataset by adding an extra column (called "1min+.bouts") to the dataframe. Then each bout would be identified with a number starting from 1 - i.e. the dataframe described above would have a series of 1s for the first bout (13:01:40 to 13:03:20) and then 2s for the second bout (13:04:20 to 13:05:50). 0s would be added to those rows with no bouts.
I hope that makes sense. If anyone could possible point me in the right direction re. packages or functions that would help me out, I should be most grateful.
This assumes that there are no NA values:
#which counts are >= 10
tmp <- count.bouts$count >= 10
#substitute FALSE with NA, so we can use na.approx for interpolation
tmp[!tmp] <- NA
library(zoo)
#fill gaps of up to two values
tmp <- na.approx(tmp, method = "constant", maxgap = 2, na.rm = FALSE)
#NA --> 0
tmp[is.na(tmp)] <- 0
#run lengths
tmp <- rle(tmp)
#we don't want run lengths shorter one minute
tmp$values[tmp$lengths < 6] <- 0
#number the run lengths we are interested in
tmp$values <- cumsum(tmp$values) * tmp$values
#inverse run length encoding
count.bouts$bout <- inverse.rle(tmp)
# time.stamp count bout
#1 13:00:00 5 0
#2 13:00:10 11 0
#3 13:00:20 16 0
#4 13:00:30 19 0
#5 13:00:40 15 0
#6 13:00:50 11 0
#7 13:01:00 8 0
#8 13:01:10 5 0
#9 13:01:20 2 0
#10 13:01:30 6 0
#11 13:01:40 12 1
#12 13:01:50 15 1
#13 13:02:00 20 1
#14 13:02:10 12 1
#15 13:02:20 6 1
#16 13:02:30 2 1
#17 13:02:40 18 1
#18 13:02:50 25 1
#19 13:03:00 26 1
#20 13:03:10 15 1
#21 13:03:20 13 1
#22 13:03:30 6 0
#23 13:03:40 5 0
#24 13:03:50 4 0
#25 13:04:00 8 0
#26 13:04:10 9 0
#27 13:04:20 16 2
#28 13:04:30 26 2
#29 13:04:30 29 2
#30 13:04:40 55 2
#31 13:04:50 21 2
#32 13:05:00 6 2
#33 13:05:10 9 2
#34 13:05:20 28 2
#35 13:05:30 16 2
#36 13:05:40 19 2
#37 13:05:50 26 2
#38 13:06:00 5 0