Non-overlapping sliding window based on index - r

For a data.frame, df, which has an index column and a value column, I would like to calculate, e.g., the mean of the values in non-overlapping sliding windows, with the window size based on the units in the index column (for example, windows which cover 10 units in the index).
There is runner::runner and slider::slide_index which allow you to slide in windows based on an index column, but I don't see a way to make the windows non-overlapping.
df = structure(list(V3 = c(17054720L, 17075353L, 17087656L, 17099107L,
17152611L, 17154984L, 17178213L, 17256231L, 17264565L, 17280822L,
17281931L, 17285949L, 17289118L, 17294251L, 17301217L, 17301843L,
17304246L, 17304887L, 17306104L, 17310741L, 17312596L, 17315102L,
17315503L, 17317233L, 17318150L, 17319156L, 17326181L, 17326432L,
17394989L, 17395610L, 17396612L, 17397875L, 17398508L, 17398800L,
17398812L, 17399211L, 17405173L, 17407349L, 17407566L, 17409897L,
17410373L, 17412216L, 17412806L, 17414103L, 17414640L, 17415572L,
17426401L, 17427037L, 17429384L, 17429434L, 17433210L, 17434084L,
17436846L, 17441524L, 17442154L, 17443131L, 17445502L, 17446157L,
17446914L, 17450515L, 17452966L, 17462185L, 17467411L, 17467684L,
17470779L, 17475921L, 17488195L, 17489577L, 17489890L, 17490932L,
17492203L, 17492452L, 17493792L, 17494101L, 17494547L, 17524203L,
17525584L, 17525970L, 17529814L, 17541673L, 17545859L, 17557144L,
17567699L, 17575800L, 17580394L, 17580813L, 17585441L, 17586471L,
17587680L, 17587975L, 17589209L, 17589246L, 17593685L, 17594915L,
17597462L, 17599844L, 17603801L, 17605824L, 17611515L, 17615213L
), V1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L)), row.names = c(NA, -100L), class = "data.frame")

What about something like this:
# Toy data: one row per index unit, indices 0..99.
df <- data.frame(index = 0:99, value = 1:100)

# Integer-divide the index by the window width (10) to label each row with
# its window (0-9 -> 0, 10-19 -> 1, ...), then average within each window.
df %>%
  mutate(window = index %/% 10) %>%
  group_by(window) %>%
  summarise(value = mean(value), n = n())
# # A tibble: 10 × 3
# window value n
# <dbl> <dbl> <int>
# 1 0 5.5 10
# 2 1 15.5 10
# 3 2 25.5 10
# 4 3 35.5 10
# 5 4 45.5 10
# 6 5 55.5 10
# 7 6 65.5 10
# 8 7 75.5 10
# 9 8 85.5 10
# 10 9 95.5 10
In the answer above, you divide the index by the window width and wrap that in the floor() function so that every observation is rounded down to its window number. This assumes that the index values are consecutive integers. An alternative, if the index is not sequential, is to make it so, as follows:
# Toy data with a non-consecutive index (100 distinct values from 0..1000).
df <- data.frame(
  index = sample(0:1000, 100, replace = FALSE),
  value = 1:100
)

# Sort by index, number the rows 0, 1, 2, ... and cut that running number
# into blocks of 10 observations; each block is one window.
df %>%
  arrange(index) %>%
  mutate(window = (row_number() - 1) %/% 10) %>%
  group_by(window) %>%
  summarise(value = mean(value), n = n())
# A tibble: 10 × 3
# window value n
# <dbl> <dbl> <int>
# 1 0 38.2 10
# 2 1 50.1 10
# 3 2 63.6 10
# 4 3 64.9 10
# 5 4 44 10
# 6 5 41.5 10
# 7 6 65.4 10
# 8 7 45.1 10
# 9 8 48.9 10
# 10 9 43.3 10

Related

Frequency of Occurrence of values above a certain value across many columns?

I have a dataset with many columns and thousands of rows. I am trying to get another column, foo which has the frequency that a value above 100 occurs in a row.
structure(list(S026401.R1 = c(0L, 0L, 0L, 0L, 0L), S026404.R1 = c(0L,
0L, 0L, 0L, 0L), S026406.R1 = c(0L, 0L, 0L, 0L, 0L), S026409.R1 = c(0L,
0L, 0L, 0L, 0L), S026412.R1 = c(0L, 0L, 0L, 0L, 0L), S026413.R1 = c(0L,
0L, 0L, 0L, 0L), S026414.R1 = c(47L, 0L, 0L, 0L, 0L), S026415.R1 = c(0L,
0L, 0L, 0L, 0L), S026416.R1 = c(31L, 0L, 0L, 0L, 0L), S026419.R1 = c(0L,
0L, 0L, 0L, 0L), S026421.R1 = c(0L, 0L, 0L, 0L, 34L), S026422.R1 = c(0L,
0L, 0L, 0L, 0L), S026423.R1 = c(0L, 0L, 0L, 0L, 0L), S026427.R1 = c(0L,
0L, 0L, 0L, 0L), S026428.R1 = c(0L, 0L, 0L, 0L, 1049L), S026429.R1 = c(0L,
0L, 0L, 0L, 0L), S026430.R1 = c(0L, 0L, 0L, 0L, 0L), S026431.R1 = c(0L,
10L, 0L, 0L, 0L), S026432.R1 = c(0L, 0L, 0L, 0L, 0L), S026433.R1 = c(0L,
0L, 0L, 0L, 0L), S026434.R1 = c(0L, 0L, 0L, 0L, 0L), S026435.R1 = c(0L,
0L, 0L, 0L, 0L), S026438.R1 = c(0L, 0L, 0L, 0L, 0L), S026440.R1 = c(0L,
0L, 0L, 0L, 0L), S026444.R1 = c(0L, 0L, 0L, 0L, 0L), S026447.R1 = c(0L,
0L, 0L, 0L, 0L), S026450.R1 = c(0L, 0L, 0L, 0L, 0L), S026451.R1 = c(0L,
0L, 0L, 0L, 0L), S026453.R1 = c(0L, 0L, 53L, 0L, 0L), S026456.R1 = c(0L,
0L, 0L, 0L, 0L), S026457.R1 = c(0L, 0L, 0L, 0L, 0L), S026458.R1 = c(0L,
0L, 0L, 0L, 0L), S026461.R1 = c(0L, 0L, 0L, 0L, 0L), S026462.R1 = c(0L,
0L, 0L, 0L, 18L), S026463.R1 = c(153L, 0L, 0L, 0L, 0L), S026464.R1 = c(0L,
0L, 0L, 0L, 0L), S026466.R1 = c(0L, 0L, 0L, 0L, 0L), S026467.R1 = c(32L,
0L, 0L, 0L, 0L), S026469.R1 = c(0L, 0L, 0L, 0L, 0L), S026470.R1 = c(0L,
0L, 0L, 0L, 0L), S026471.R1 = c(0L, 0L, 0L, 0L, 0L), S026473.R1 = c(0L,
0L, 0L, 0L, 0L), S026474.R1 = c(0L, 0L, 0L, 0L, 0L), S026476.R1 = c(0L,
0L, 0L, 0L, 0L), S026477.R1 = c(780L, 0L, 0L, 0L, 0L), S026483.R1 = c(21L,
0L, 0L, 0L, 0L), S026484.R1 = c(0L, 0L, 0L, 0L, 0L), S026485.R1 = c(0L,
0L, 0L, 13L, 0L), S026488.R1 = c(0L, 0L, 0L, 0L, 0L), S026489.R1 = c(0L,
0L, 0L, 0L, 0L), S026490.R1 = c(60L, 0L, 0L, 0L, 0L), S026493.R1 = c(0L,
0L, 103L, 0L, 0L)), class = c("rowwise_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -5L), groups = structure(list(
.rows = structure(list(1L, 2L, 3L, 4L, 5L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame")))
what i've been trying is:
df %>%
rowwise() %>%
mutate(foo = sum(c(8:52>100), na.rm = TRUE))
but this returns all 0s in the new column foo
when i change >100 to a lower number, it does provide a frequency. however, there are many values above 100.
As suggested, use c_across with a range (or collection) of columns.
library(dplyr)
# Count, per row, how many of the columns X1..X8 hold a value above 15.
df %>%
# Treat every row as its own group so the next step is evaluated row by row.
rowwise() %>%
# c_across() gathers the current row's X1:X8 values into one vector; summing
# the logical comparison counts the TRUEs.
mutate(foo = sum(c_across(X1:X8) > 15)) %>%
# Drop the row-wise grouping so later verbs act on whole columns again.
ungroup()
# # A tibble: 3 x 9
# X1 X2 X3 X4 X5 X6 X7 X8 foo
# <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1 17 10 24 7 23 2 22 12 4
# 2 5 4 15 20 14 19 6 11 2
# 3 1 18 8 9 21 3 16 13 3
Sample data:
set.seed(42)
df <- data.frame(matrix(sample(24), nrow=3))
df
# X1 X2 X3 X4 X5 X6 X7 X8
# 1 17 10 24 7 23 2 22 12
# 2 5 4 15 20 14 19 6 11
# 3 1 18 8 9 21 3 16 13

Add new columns with defined values in R

I have a data.table named dmat. I want to add each string in missing_snps to dmat as a new column and set all of its rows to zero. The output should remain in the same class as it was.
I would appreciate any suggestion.
dmat <- structure(list(`1:27950613:G:A` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27950883:CTA:C` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27952180:A:G` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27953106:A:G` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27953374:G:T` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27953514:T:TA` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27953608:T:C` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27954027:G:A` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27954415:T:C` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27962685:T:C` = c(0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 0L)), row.names = c(NA, -10L), class = c("tbl_df", "tbl",
"data.frame"))
missing_snps <- c("1:169858888:G:A", "1:16985867657:T:A", "1:132862874:G:A")
dmat[,c("1:169858888:G:A", "1:16985867657:T:A", "1:132862874:G:A")] <- 0
or dmat[, missing_snps] <- 0
Using data.table,
# setDT() converts dmat to a data.table by reference; the result is also
# re-assigned to dmat here.
dmat <- setDT(dmat)
missing_snps <- c("1:169858888:G:A", "1:16985867657:T:A", "1:132862874:G:A")
# The parentheses around missing_snps make := use the vector's contents as the
# new column names; every row of each new column is set to 0.
dmat[,(missing_snps ):=0]
Output
> dmat[,..missing_snps ]
1:169858888:G:A 1:16985867657:T:A 1:132862874:G:A
1: 0 0 0
2: 0 0 0
3: 0 0 0
4: 0 0 0
5: 0 0 0
6: 0 0 0
7: 0 0 0
8: 0 0 0
9: 0 0 0
10: 0 0 0
The columns you want to add have been added.

Aggregating column according to their names

French student here, so my english's not that great, sorry.
We transformed a data set with species and their locations into the corresponding origins of these species and their locations.
The data set has ~600 columns, named U, A, W, L or E (species origin), each containing a 0 or 1 (presence / absence of a species at a location),
and 2 columns with coordinates (corresponding to the data-collecting station).
More than 8000 lines, for each station where data was found.
A simplification of the data set would like that :
[Longitude] [Latitude] [A][U][U][L][E][A][U] ... [+600]
[1,] -5.89 35.71 0 0 1 0 0 1 1
[2,] -5.89 35.81 0 1 0 0 0 0 1
[3,] -5.89 36.01 1 0 0 1 1 1 0
[4,] -5.89 36.1 0 0 0 1 0 1 0
[1,] -5.89 36.21 1 1 1 0 0 1 1
[2,] -5.79 35.81 1 1 0 1 0 1 0
[3,] -5.79 35.91 0 1 0 0 0 0 1
[4,] -5.79 36.01 1 1 0 1 0 1 0
[+8000]
What we want to do is some sort of conditional sum, where all columns of the same origin are regrouped into one column each and their contents summed, like so:
`
[Longitude] [Latitude] [A][U][L][W][E]
[1,] -5.89 35.71 12 6 5 0 13
[2,] -5.89 35.81 5 1 8 10 20
[3,] -5.89 36.01 1 28 3 6 2
[4,] -5.89 36.1 4 25 0 1 11
[1,] -5.89 36.21 9 1 9 3 5
[2,] -5.79 35.81 6 5 12 1 8
[3,] -5.79 35.91 5 2 7 15 10
[4,] -5.79 36.01 10 3 5 12 4
[+8000]
Only the A,U,L,E,W must be summed.
Longitude, Latitude and the number of rows must be kept the same.
We tried aggregate or tapply, without success, but maybe a loop is needed...
Any ideas ?
a capture of the data set
MacOs answer
MacOS answer 2
Thanks
MacOS function : espOri => df espagg => df.agg
espagg <- aggregate(. ~ Longitude + Latitude,
especeOri,
FUN = sum)
# For each name in column.names, overwrite that column of the data frame passed
# as `especeOri` with the row sums of the columns whose names match the name.
# Returns the modified data frame.
aggregate.columns <- function(especeOri, column.names)
{
for (column.name in column.names) {
# NOTE(review): the column indices come from colnames(especeOri) (the
# argument), but the values are read from the global `espagg`; the two only
# line up when the function is called as aggregate.columns(espagg, ...).
# NOTE(review): the pattern "<name>.*" is unanchored, so it matches any column
# whose name merely contains <name>; "L.*" also matches "Longitude" and
# "Latitude".
especeOri[[column.name]] <- rowSums(subset(espagg, select = grep(paste(column.name, ".*", sep = ""), colnames(especeOri))))
}
return(especeOri)
}
aggregate.column.names <- c("A", "U", "L", "E", "W")
espagg <- aggregate.columns(espagg, aggregate.column.names)
espagg <- subset(especeOri, select = c("Longitude", "Latitude", aggregate.column.names))
View(espagg)
dput of the data set
dput(especeOri[1:10,1:20])
structure(list(Longitude = c(-5.89, -5.89, -5.89, -5.89, -5.89,
-5.79, -5.79, -5.79, -5.79, -5.69), Latitude = c(35.71, 35.81,
36.01, 36.11, 36.21, 35.81, 35.91, 36.01, 36.11, 35.81), L = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), U.1 = c(0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L), A = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U.2 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), E = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), U.3 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), E.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U.4 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U.5 = c(0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L), U.6 = c(1L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 1L), L.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
U.7 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), U.8 = c(0L,
0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L), U.9 = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), U.10 = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), A.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), U.11 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L)), row.names = c(NA, 10L), class = "data.frame")
google drive with all the data sets, a few explanations and our script.
https://drive.google.com/drive/folders/1fnWnZZDC3gyWTtSoqi_l7Wuhje5qpJmL?usp=sharing
EDIT : added some values for longitude and latitude to illustrate and a screenshot
Here is a tidyverse solution using the data you provided.
library(dplyr)
library(tidyr)
# Read the occurrence table (one column per species) and the species traits.
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
fish <- read.table("Data_fish.txt", header = TRUE)
traits <- read.table("Data_traits.txt", header = TRUE)
fish %>%
  # Reshape to one row per (cell, species) pair.
  pivot_longer(-c(ID_cellule, Longitude, Latitude), names_to = "Species", values_to = "Occur") %>%
  mutate(ID_cellule = factor(ID_cellule, levels = unique(ID_cellule))) %>% # use factor to fix the display order as-is
  # Attach each species' origin, then total the occurrences per cell and origin.
  left_join(traits %>% select(Species, Origin), by = "Species") %>%
  group_by(ID_cellule, Longitude, Latitude, Origin) %>%
  # "drop_last" is what summarise() does by default here; stating it keeps the
  # result grouped by cell as before and silences the regrouping message.
  summarise(Occur = sum(Occur), .groups = "drop_last") %>%
  # Spread the origins back out into one column each.
  pivot_wider(names_from = "Origin", values_from = "Occur")
Output
# A tibble: 8,154 x 8
# Groups: ID_cellule, Longitude, Latitude [8,154]
ID_cellule Longitude Latitude A E L U W
<fct> <dbl> <dbl> <int> <int> <int> <int> <int>
1 ID1 -5.89 35.7 8 10 0 178 0
2 ID2 -5.89 35.8 11 10 0 234 0
3 ID3 -5.89 36.0 9 11 0 195 0
4 ID4 -5.89 36.1 12 10 0 227 0
5 ID5 -5.89 36.2 13 17 0 268 0
6 ID6 -5.79 35.8 9 8 0 205 0
7 ID7 -5.79 35.9 8 9 0 168 0
8 ID8 -5.79 36.0 11 14 0 262 0
9 ID9 -5.79 36.1 10 10 0 193 0
10 ID10 -5.69 35.8 9 10 0 230 0
The following should do the job.
# Example data. The repeated A/U arguments are de-duplicated by data.frame()
# into A, A.1, U, U.1, U.2.
df <- data.frame(Longitude = c(-5.89, -5.89, -5.89, -5.89, -5.89, -5.79, -5.79, -5.79, -5.89, -5.89),
                 Latitude = c(35.71, 35.81, 36.01, 36.1, 36.21, 35.81, 35.91, 36.01, 35.71, 35.81),
                 A = c(0, 0, 1, 0, 1, 1, 0, 1, 1, 1),
                 U = c(0, 1, 0, 0, 1, 1, 1, 1, 1, 1),
                 U = c(1, 0, 0, 0, 1, 0, 0, 0, 1, 1),
                 L = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1),
                 E = c(0, 0, 1, 0, 0, 0, 0, 0, 1, 1),
                 A = c(1, 0, 1, 1, 1, 1, 0, 1, 1, 1),
                 U = c(1, 1, 0, 0, 1, 0, 1, 0, 1, 1))
# Sum every species column within each (Longitude, Latitude) station.
df.agg <- aggregate(. ~ Longitude + Latitude,
                    df,
                    FUN = sum)
# Total the columns of each origin. The patterns are anchored and only accept
# "<origin>" or "<origin>.<digits>"; an unanchored "L.*" would also match
# "Longitude" and "Latitude" and add the coordinates to the L total.
df.agg$A <- rowSums(subset(df.agg, select = grep("^A(\\.[0-9]+)?$", colnames(df.agg))))
df.agg$U <- rowSums(subset(df.agg, select = grep("^U(\\.[0-9]+)?$", colnames(df.agg))))
df.agg$L <- rowSums(subset(df.agg, select = grep("^L(\\.[0-9]+)?$", colnames(df.agg))))
df.agg$E <- rowSums(subset(df.agg, select = grep("^E(\\.[0-9]+)?$", colnames(df.agg))))
# Keep the coordinates and the four per-origin totals only.
df.agg <- subset(df.agg, select = c(Longitude, Latitude, A, U, L, E))
Update
The OP asked for a solution where he/she does not have to write the code for rowSums explicitly, because he/she has too many columns to actually write it out, i.e. it is inconvenient. The following should do the job.
df <- structure(list(Longitude = c(-5.89, -5.89, -5.89, -5.89, -5.89,
-5.79, -5.79, -5.79, -5.79, -5.69), Latitude = c(35.71, 35.81,
36.01, 36.11, 36.21, 35.81, 35.91, 36.01, 36.11, 35.81), L = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), U.1 = c(0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L), A = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U.2 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), E = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), U.3 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), E.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U.4 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U.5 = c(0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L), U.6 = c(1L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 1L), L.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
U.7 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), U.8 = c(0L,
0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L), U.9 = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), U.10 = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), A.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), U.11 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L)), row.names = c(NA, 10L), class = "data.frame")
df.agg <- aggregate(. ~ Longitude + Latitude,
df,
FUN = sum)
# Collapse de-duplicated columns back into one total per base name.
#
# For every name in `column.names` (e.g. "A"), the columns A, A.1, A.2, ...
# of `df.my` are summed row by row and the total is stored in column A. So,
#   A    1 1 0
#   A.1  2 1 0
#   A.2  0 0 1
# becomes
#   A    3 2 1
#
# Arguments:
#   df.my         data frame whose columns are named <name> or <name>.<digits>
#   column.names  character vector of base names to total
# Returns `df.my` with one total column per base name. The suffixed columns are
# left in place; a base name with no matching column at all yields a column
# of zeros.
aggregate.columns <- function(df.my, column.names)
{
  for (column.name in column.names) {
    # Anchored, with the dot escaped, so that only "<name>.<digits>" matches;
    # an unanchored "A.[1-9]+" would also pick up names such as "LA.1".
    suffixed <- grep(paste0("^", column.name, "\\.[0-9]+$"), colnames(df.my))
    # Use 0 when the base column itself is absent instead of failing on
    # NULL + numeric.
    base <- if (column.name %in% colnames(df.my)) df.my[[column.name]] else 0
    df.my[[column.name]] <- base + rowSums(df.my[, suffixed, drop = FALSE])
  }
  df.my
}
aggregate.column.names <- c("A", "U", "L", "E")
df.agg <- aggregate.columns(df.agg, aggregate.column.names)
df.agg <- subset(df.agg, select = c("Longitude", "Latitude", aggregate.column.names))
df.agg
The key to making this work is this line.
grep(paste(column.name, ".[1-9]+", sep = ""), colnames(df.my))
It returns the indices of all columns whose names start with the current value of the variable column.name followed by a dot and a sequence of digits, e.g. when the value of column.name is A, then A.1, A.345, A.67, A.9798 and A.111111 should all be matched. Please check!
Update 3
After the user of the OP provided the data, I did come up with the following. This includes a function for renaming. This is necessary since the data frame has columns with identical names. For example, this function transforms a sequence of column names A, A, A, A into A, A.1, A.2, A.3.
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
climate <- read.table("Data_climate.txt", header = TRUE)
poissons <- read.table("Data_fish.txt", header = TRUE)
traitsNA <- read.table("Data_traits.txt", header = TRUE)
# Strip three leading columns from the fish table in three steps (column 2,
# then the new column 2, then column 1). Presumably these are the cell ID and
# the coordinates; verify against Data_fish.txt.
especes <- poissons[, -2]
especes2 <- especes[, -2]
especes3 <- especes2[, -1]
# Label every remaining column with the origin listed in the traits table.
# NOTE(review): assumes the rows of traitsNA are in the same order as these
# columns. This produces many identically named columns.
colnames(especes3) <- traitsNA$Origin
# Put the coordinates (and, for `origine`, the cell ID) back in front.
especes44 <- cbind(climate$Latitude, especes3)
especeOri <- cbind(climate$Longitude, especes44)
origine <- cbind(climate$ID_cellule, especeOri)
colnames(origine)[1] <- "ID_cellule"
colnames(origine)[2] <- "Longitude"
colnames(origine)[3] <- "Latitude"
colnames(especeOri)[1] <- "Longitude"
colnames(especeOri)[2] <- "Latitude"
# Make duplicated column names unique, one distinct name at a time: the k-th
# repeat of a name receives the suffix ".k" (A, A, A -> A, A.1, A.2), while
# names that occur once are left untouched. Returns the renamed data frame.
rename.columns <- function(df)
{
  for (name in unique(colnames(df))) {
    hits <- which(colnames(df) == name)
    colnames(df)[hits] <- make.unique(colnames(df)[hits])
  }
  df
}
especeOri <- rename.columns(especeOri)
espagg <- aggregate(. ~ Longitude + Latitude,
especeOri,
FUN = sum)
# Collapse de-duplicated columns back into one total per base name.
#
# For every name in `column.names` (e.g. "A"), the columns A, A.1, A.2, ...
# of `df.my` are summed row by row and the total is stored in column A. So,
#   A    1 1 0
#   A.1  2 1 0
#   A.2  0 0 1
# becomes
#   A    3 2 1
#
# Arguments:
#   df.my         data frame whose columns are named <name> or <name>.<digits>
#   column.names  character vector of base names to total
# Returns `df.my` with one total column per base name. The suffixed columns are
# left in place; a base name with no matching column at all yields a column
# of zeros.
aggregate.columns <- function(df.my, column.names)
{
  for (column.name in column.names) {
    # Anchored, with the dot escaped, so that only "<name>.<digits>" matches;
    # an unanchored "A.[1-9]+" would also pick up names such as "LA.1".
    suffixed <- grep(paste0("^", column.name, "\\.[0-9]+$"), colnames(df.my))
    # Use 0 when the base column itself is absent instead of failing on
    # NULL + numeric.
    base <- if (column.name %in% colnames(df.my)) df.my[[column.name]] else 0
    df.my[[column.name]] <- base + rowSums(df.my[, suffixed, drop = FALSE])
  }
  df.my
}
aggregate.column.names <- c("A", "U", "L", "E", "W")
espagg <- aggregate.columns(espagg, aggregate.column.names)
# Select from espagg, the aggregated table, so that the per-origin totals
# computed on the previous line are what end up in the result.
espagg <- subset(espagg, select = c("Longitude", "Latitude", aggregate.column.names))
HTH!

Extract the column names for each row which meets a condition [duplicate]

This question already has answers here:
For each row return the column name of the largest value
(10 answers)
Closed 4 years ago.
d <- structure(
list(
Cl = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
SaCl = c(0, 1, 0, 0,0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0),
SiCl = c(0L,0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,0L, 0L, 0L),
ClLo = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
SiClLo = c(0L, 0L, 0L,0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
SaClLo = c(1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1),
SaLo = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
SaSiLo = c(0L, 0L,0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
SiLo = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
LoSa = c(0L, 0L, 0L, 0L,0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
Sa = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,0L, 0L, 0L, 0L, 0L, 0L, 0L)
),
row.names = c(NA, 20L),
class = "data.frame"
)
Each row has only one 1. I want to extract the column name which has 1 for each row such that my dataframe looks like
row.id | names
-------+-------
1 | SaClLo
2 | SaCl
3 | SaClLo
4 | SaClLo
I tried to run a function to each row
apply(d, 1, function(x) colnames(x)[x == 1])
This is giving me NULL.
Use max.col to find the positions of the 1s and use this vector to select the respective column names.
# max.col() gives, for each row, the index of the column holding the row
# maximum, which here is the position of the row's single 1.
# seq_len() keeps row.id empty for a zero-row input (1:0 would yield c(1, 0)),
# and ties.method = "first" makes the choice deterministic if a row ever has
# no unique maximum (the default breaks ties at random).
data.frame(row.id = seq_len(nrow(d)),
           names = names(d)[max.col(d, ties.method = "first")])
# row.id names
#1 1 SaClLo
#2 2 SaCl
#3 3 SaClLo
#4 4 SaClLo
#...
For each row, we find which column has a value of 1, then select the value of colnames for that row. Then we convert it to a data.frame
data.frame(names = apply(d, 1, function(x) colnames(d)[which(x == 1)]))
names
1 SaClLo
2 SaCl
3 SaClLo
4 SaClLo
...
Optionally, you can run it through tibble::rownames_to_column() to move the row ids from the row names into a column.
data.frame(names = apply(d, 1, function(x) colnames(d)[which(x == 1)])) %>%
tibble::rownames_to_column()
rowname names
1 1 SaClLo
2 2 SaCl
3 3 SaClLo
4 4 SaClLo
...
A little-known feature of which is your friend:
> which(d==1, arr.ind=TRUE)
row col
2 2 2
11 11 2
15 15 2
13 13 4
...
The second column is the information you need:
> arr_indices <- which(d == 1, arr.ind = TRUE)
> colnames(d)[ arr_indices[, 2] ]
[1] "SaCl" "SaCl" "SaCl" "ClLo" "SaClLo" "SaClLo" "SaClLo" "SaClLo"
[9] "SaClLo" "SaClLo" "SaClLo" "SaClLo" "SaClLo" "SaClLo" "SaClLo" "SaClLo"
[17] "SaClLo" "SaClLo" "SaClLo" "SaClLo"
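And you can put this into a data frame or whatever. Note that which(..., arr.ind = TRUE) lists the matches column by column, so the names above are not in row order; use colnames(d)[ arr_indices[order(arr_indices[, 1]), 2] ] if you need one name per row in the original row order. I like this answer because it is relatively easy-to-read code.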

retrieving names(table) following subset where only 1 observation

My issue is that when I try to retrieve names(myresults) after subsetting a table, I get NULL when the returned subset has only 1 result. Rather than returning a character vector of row names, R returns an integer (in this case 1).
Here is a table
head(tbl)
1 2 3 4 5 6
afford 0 1 0 0 0 0
app 0 0 0 1 0 0
back 0 1 0 0 0 0
cancel 0 0 0 0 1 0
charg 0 0 0 0 0 1
download 0 0 0 0 0 1
I have been subsetting the table within a loop to return a table for each group. If a term belongs to a group it has a value of 1:
for (i in 1:ncol(tbl)) {
t <- tbl[which(tbl[,i]==1),i]
nam <- names(t)
df <- as.data.frame(nam)
names(df) <- paste0("Cluster ",i)
print(kable(df))
}
This loop seems to work OK when there are more than one instance of a term returned by which(). But the group 4, which has only 1 term "app" gives me issues. Here's an example on group 3, which works as expected then on group 4, which does not:
> t <- tbl[which(tbl[,4]==1),4] # only 1 observation meets this criteria
> t
[1] 1
> t <- tbl[which(tbl[,3]==1),3] # 3 observations meet this criteria
> t
aword cat dog
1 1 1
So I can get names(t) for tbl[,3] where it has 3 returned instances but not for tbl[,4] which only has 1.
> t <- fintab[which(fintab[,4]==1),4]
> names(t)
NULL # expected "app"
> t <- fintab[which(fintab[,3]==1),3]
> names(t)
[1] "aword" "cat" "dog"
How can I get names(t) when I have only 1 instance returned like in the example?
Some further context following comment below:
> str(tbl)
'table' int [1:33, 1:6] 0 0 0 0 0 0 0 0 0 0 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:33] "aword" "app" "cat" "dog" ...
..$ : chr [1:6] "1" "2" "3" "4" ...
>
and
> dput(tbl)
structure(c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L), .Dim = c(33L, 6L), .Dimnames = structure(list(
c("aword", "app", "back", "cancel", "charg", "download",
"enough", "expens", "get", "great", "just", "like", "love",
"cat", "dog", "bla", "month", "much", "need",
"never", "phone", "pleas", "blabla", "realli", "term", "sign",
"thank", "time", "triangle", "use", "want", "will", "work"), c("1",
"2", "3", "4", "5", "6")), .Names = c("", "")), class = "table")
As we are subsetting a single column, we can take the logical index (tbl[,4] == 1) and use that to subset the column vector. There is no need to wrap it with which() unless there are NAs; in that case, which() removes those NAs.
tbl[,4][tbl[,4]==1]
# app
# 1
tbl[,3][tbl[,3]==1]
# cat blabla time
# 1 1 1

Resources