Related
I have a dataset with many columns and thousands of rows. I am trying to get another column, foo which has the frequency that a value above 100 occurs in a row.
structure(list(S026401.R1 = c(0L, 0L, 0L, 0L, 0L), S026404.R1 = c(0L,
0L, 0L, 0L, 0L), S026406.R1 = c(0L, 0L, 0L, 0L, 0L), S026409.R1 = c(0L,
0L, 0L, 0L, 0L), S026412.R1 = c(0L, 0L, 0L, 0L, 0L), S026413.R1 = c(0L,
0L, 0L, 0L, 0L), S026414.R1 = c(47L, 0L, 0L, 0L, 0L), S026415.R1 = c(0L,
0L, 0L, 0L, 0L), S026416.R1 = c(31L, 0L, 0L, 0L, 0L), S026419.R1 = c(0L,
0L, 0L, 0L, 0L), S026421.R1 = c(0L, 0L, 0L, 0L, 34L), S026422.R1 = c(0L,
0L, 0L, 0L, 0L), S026423.R1 = c(0L, 0L, 0L, 0L, 0L), S026427.R1 = c(0L,
0L, 0L, 0L, 0L), S026428.R1 = c(0L, 0L, 0L, 0L, 1049L), S026429.R1 = c(0L,
0L, 0L, 0L, 0L), S026430.R1 = c(0L, 0L, 0L, 0L, 0L), S026431.R1 = c(0L,
10L, 0L, 0L, 0L), S026432.R1 = c(0L, 0L, 0L, 0L, 0L), S026433.R1 = c(0L,
0L, 0L, 0L, 0L), S026434.R1 = c(0L, 0L, 0L, 0L, 0L), S026435.R1 = c(0L,
0L, 0L, 0L, 0L), S026438.R1 = c(0L, 0L, 0L, 0L, 0L), S026440.R1 = c(0L,
0L, 0L, 0L, 0L), S026444.R1 = c(0L, 0L, 0L, 0L, 0L), S026447.R1 = c(0L,
0L, 0L, 0L, 0L), S026450.R1 = c(0L, 0L, 0L, 0L, 0L), S026451.R1 = c(0L,
0L, 0L, 0L, 0L), S026453.R1 = c(0L, 0L, 53L, 0L, 0L), S026456.R1 = c(0L,
0L, 0L, 0L, 0L), S026457.R1 = c(0L, 0L, 0L, 0L, 0L), S026458.R1 = c(0L,
0L, 0L, 0L, 0L), S026461.R1 = c(0L, 0L, 0L, 0L, 0L), S026462.R1 = c(0L,
0L, 0L, 0L, 18L), S026463.R1 = c(153L, 0L, 0L, 0L, 0L), S026464.R1 = c(0L,
0L, 0L, 0L, 0L), S026466.R1 = c(0L, 0L, 0L, 0L, 0L), S026467.R1 = c(32L,
0L, 0L, 0L, 0L), S026469.R1 = c(0L, 0L, 0L, 0L, 0L), S026470.R1 = c(0L,
0L, 0L, 0L, 0L), S026471.R1 = c(0L, 0L, 0L, 0L, 0L), S026473.R1 = c(0L,
0L, 0L, 0L, 0L), S026474.R1 = c(0L, 0L, 0L, 0L, 0L), S026476.R1 = c(0L,
0L, 0L, 0L, 0L), S026477.R1 = c(780L, 0L, 0L, 0L, 0L), S026483.R1 = c(21L,
0L, 0L, 0L, 0L), S026484.R1 = c(0L, 0L, 0L, 0L, 0L), S026485.R1 = c(0L,
0L, 0L, 13L, 0L), S026488.R1 = c(0L, 0L, 0L, 0L, 0L), S026489.R1 = c(0L,
0L, 0L, 0L, 0L), S026490.R1 = c(60L, 0L, 0L, 0L, 0L), S026493.R1 = c(0L,
0L, 103L, 0L, 0L)), class = c("rowwise_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -5L), groups = structure(list(
.rows = structure(list(1L, 2L, 3L, 4L, 5L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame")))
what i've been trying is:
df %>%
rowwise() %>%
mutate(foo = sum(c(8:52>100), na.rm = TRUE))
but this returns all 0s in the new column foo
when i change >100 to a lower number, it does provide a frequency. however, there are many values above 100.
As suggested, use c_across with a range (or collection) of columns.
library(dplyr)
df %>%
rowwise() %>%
mutate(foo = sum(c_across(X1:X8) > 15)) %>%
ungroup()
# # A tibble: 3 x 9
# X1 X2 X3 X4 X5 X6 X7 X8 foo
# <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1 17 10 24 7 23 2 22 12 4
# 2 5 4 15 20 14 19 6 11 2
# 3 1 18 8 9 21 3 16 13 3
Sample data:
set.seed(42)
df <- data.frame(matrix(sample(24), nrow=3))
df
# X1 X2 X3 X4 X5 X6 X7 X8
# 1 17 10 24 7 23 2 22 12
# 2 5 4 15 20 14 19 6 11
# 3 1 18 8 9 21 3 16 13
I have a data.table named dmat. I want to add each character of missing_snps to dmat as new column and assign all rows as zero. The output remains in the same class as it was.
I would appreciate any suggestion.
dmat <- structure(list(`1:27950613:G:A` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27950883:CTA:C` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27952180:A:G` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27953106:A:G` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27953374:G:T` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27953514:T:TA` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27953608:T:C` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27954027:G:A` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27954415:T:C` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), `1:27962685:T:C` = c(0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 0L)), row.names = c(NA, -10L), class = c("tbl_df", "tbl",
"data.frame"))
missing_snps <- c("1:169858888:G:A", "1:16985867657:T:A", "1:132862874:G:A")
dmat[,c("1:169858888:G:A", "1:16985867657:T:A", "1:132862874:G:A")] <- 0
or dmat[, missing_snps] <- 0
Using data.table,
dmat <- setDT(dmat)
missing_snps <- c("1:169858888:G:A", "1:16985867657:T:A", "1:132862874:G:A")
dmat[,(missing_snps ):=0]
Output
> dmat[,..missing_snps ]
1:169858888:G:A 1:16985867657:T:A 1:132862874:G:A
1: 0 0 0
2: 0 0 0
3: 0 0 0
4: 0 0 0
5: 0 0 0
6: 0 0 0
7: 0 0 0
8: 0 0 0
9: 0 0 0
10: 0 0 0
The columns you want to mutate has been added.
French student here, so my english's not that great, sorry.
We transformed a data set with species and their locations, to the corresponding origin of theses species and their locations.
The data set has 600~ columns, named U, A, W, L or E (species origin), inside of which a 0 or 1 (presence / absence of a species at location)
and 2 columns with coordonates (corresponding to the data collecting station).
More than 8000 lines, for each station where data was found.
A simplification of the data set would like that :
[Longitude] [Latitude] [A][U][U][L][E][A][U] ... [+600]
[1,] -5.89 35.71 0 0 1 0 0 1 1
[2,] -5.89 35.81 0 1 0 0 0 0 1
[3,] -5.89 36.01 1 0 0 1 1 1 0
[4,] -5.89 36.1 0 0 0 1 0 1 0
[1,] -5.89 36.21 1 1 1 0 0 1 1
[2,] -5.79 35.81 1 1 0 1 0 1 0
[3,] -5.79 35.91 0 1 0 0 0 0 1
[4,] -5.79 36.01 1 1 0 1 0 1 0
[+8000]
What we want to do is to some sort of conditional sum, where all origin are regrouped into one column each and their content summed , like so :
`
[Longitude] [Latitude] [A][U][L][W][E]
[1,] -5.89 35.71 12 6 5 0 13
[2,] -5.89 35.81 5 1 8 10 20
[3,] -5.89 36.01 1 28 3 6 2
[4,] -5.89 36.1 4 25 0 1 11
[1,] -5.89 36.21 9 1 9 3 5
[2,] -5.79 35.81 6 5 12 1 8
[3,] -5.79 35.91 5 2 7 15 10
[4,] -5.79 36.01 10 3 5 12 4
[+8000]
Only the A,U,L,E,W must be summed.
Longitude, Latitude and number of rows must ne kept the same.
We tried aggregate or tapply, without success, but maybe a loop is needed...
Any ideas ?
a capture of the data set
MacOs answer
MacOS answer 2
Thanks
MacOS function : espOri => df espagg => df.agg
espagg <- aggregate(. ~ Longitude + Latitude,
especeOri,
FUN = sum)
aggregate.columns <- function(especeOri, column.names)
{
for (column.name in column.names) {
especeOri[[column.name]] <- rowSums(subset(espagg, select = grep(paste(column.name, ".*", sep = ""), colnames(especeOri))))
}
return(especeOri)
}
aggregate.column.names <- c("A", "U", "L", "E", "W")
espagg <- aggregate.columns(espagg, aggregate.column.names)
espagg <- subset(especeOri, select = c("Longitude", "Latitude", aggregate.column.names))
View(espagg)
dput of the data set
dput(especeOri[1:10,1:20])
structure(list(Longitude = c(-5.89, -5.89, -5.89, -5.89, -5.89,
-5.79, -5.79, -5.79, -5.79, -5.69), Latitude = c(35.71, 35.81,
36.01, 36.11, 36.21, 35.81, 35.91, 36.01, 36.11, 35.81), L = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), U.1 = c(0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L), A = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U.2 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), E = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), U.3 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), E.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U.4 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U.5 = c(0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L), U.6 = c(1L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 1L), L.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
U.7 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), U.8 = c(0L,
0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L), U.9 = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), U.10 = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), A.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), U.11 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L)), row.names = c(NA, 10L), class = "data.frame")
google drive with all the data sets, a few explanations and our script.
https://drive.google.com/drive/folders/1fnWnZZDC3gyWTtSoqi_l7Wuhje5qpJmL?usp=sharing
EDIT : added some values for longitude and latitude to illustrate and a screenshot
Here is a tidyverse solution using the data you provided.
library(dplyr)
library(tidyr)
fish <- read.table("Data_fish.txt", header = T)
traits <- read.table("Data_traits.txt", header = T)
fish %>%
pivot_longer(-c(ID_cellule, Longitude, Latitude), names_to = "Species", values_to = "Occur") %>%
mutate(ID_cellule = factor(ID_cellule, levels = unique(ID_cellule))) %>% # use factor to fix the display order as-is
left_join(traits %>% select(Species, Origin), by = "Species") %>%
group_by(ID_cellule, Longitude, Latitude, Origin) %>%
summarise(Occur = sum(Occur)) %>%
pivot_wider(names_from = "Origin", values_from = "Occur")
Output
# A tibble: 8,154 x 8
# Groups: ID_cellule, Longitude, Latitude [8,154]
ID_cellule Longitude Latitude A E L U W
<fct> <dbl> <dbl> <int> <int> <int> <int> <int>
1 ID1 -5.89 35.7 8 10 0 178 0
2 ID2 -5.89 35.8 11 10 0 234 0
3 ID3 -5.89 36.0 9 11 0 195 0
4 ID4 -5.89 36.1 12 10 0 227 0
5 ID5 -5.89 36.2 13 17 0 268 0
6 ID6 -5.79 35.8 9 8 0 205 0
7 ID7 -5.79 35.9 8 9 0 168 0
8 ID8 -5.79 36.0 11 14 0 262 0
9 ID9 -5.79 36.1 10 10 0 193 0
10 ID10 -5.69 35.8 9 10 0 230 0
The following should do the job.
df <- data.frame(Longitude = c(-5.89, -5.89, -5.89, -5.89, -5.89, -5.79, -5.79, -5.79, -5.89, -5.89),
Latitude = c(35.71, 35.81, 36.01, 36.1, 36.21, 35.81, 35.91, 36.01, 35.71, 35.81),
A = c(0, 0, 1, 0, 1, 1, 0, 1, 1, 1),
U = c(0, 1, 0, 0, 1, 1, 1, 1, 1, 1),
U = c(1, 0, 0, 0, 1, 0, 0, 0, 1, 1),
L = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1),
E = c(0, 0, 1, 0, 0, 0, 0, 0, 1, 1),
A = c(1, 0, 1, 1, 1, 1, 0, 1, 1, 1),
U = c(1, 1, 0, 0, 1, 0, 1, 0, 1, 1))
df.agg <- aggregate(. ~ Longitude + Latitude,
df,
FUN = sum)
df.agg$A <- rowSums(subset(df.agg, select = grep("A.*", colnames(df.agg))))
df.agg$U <- rowSums(subset(df.agg, select = grep("U.*", colnames(df.agg))))
df.agg$L <- rowSums(subset(df.agg, select = grep("L.*", colnames(df.agg))))
df.agg$E <- rowSums(subset(df.agg, select = grep("E.*", colnames(df.agg))))
df.agg <- subset(df.agg, select = c(Longitude, Latitude, A, U, L, E))
Update
The OP user asked for a solution where he/she does not have to write the code for rowSums explicitely, because he/she has to many columns to actually write it out, i.e. it is inconvinient. The following should do the job.
df <- structure(list(Longitude = c(-5.89, -5.89, -5.89, -5.89, -5.89,
-5.79, -5.79, -5.79, -5.79, -5.69), Latitude = c(35.71, 35.81,
36.01, 36.11, 36.21, 35.81, 35.91, 36.01, 36.11, 35.81), L = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), U.1 = c(0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L), A = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U.2 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), E = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), U.3 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), E.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U.4 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U.5 = c(0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L), U.6 = c(1L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 1L), L.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
U.7 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), U.8 = c(0L,
0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L), U.9 = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), U.10 = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), A.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), U.11 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L)), row.names = c(NA, 10L), class = "data.frame")
df.agg <- aggregate(. ~ Longitude + Latitude,
df,
FUN = sum)
# This function aggregates rows if their column names have the same start.
# Suppose we have a data frame with column names A, A.1, and A.2. Then,
# the rows of these columns are aggregated using sum. So,
# A 1 1 0
# A.1 2 1 0
# A.2 0 0 1
# becomes
# A 3 2 1
aggregate.columns <- function(df.my, column.names)
{
for (column.name in column.names) {
df.my[[column.name]] <- df.my[[column.name]] +
rowSums(subset(df.my,
select = grep(paste(column.name, ".[1-9]+", sep = ""),
colnames(df.my))))
}
return(df.my)
}
aggregate.column.names <- c("A", "U", "L", "E")
df.agg <- aggregate.columns(df.agg, aggregate.column.names)
df.agg <- subset(df.agg, select = c("Longitude", "Latitude", aggregate.column.names))
df.agg
The key to making this work is this line.
grep(paste(column.name, ".[1-9]+", sep = ""), colnames(df.my))
It returns all column names that have start with the current value of variable column.name followed by a dot and any sequence of digits, e.g. when the value of column.name is A then A.1, A.345, A.67, A.9798, A.111111 should all be returned. Please check!
Update 3
After the user of the OP provided the data, I did come up with the following. This includes a function for renaming. This is necessary since the data frame has columns with identical names. For example, this function transforms a sequence of column names A, A, A, A into A, A.1, A.2, A.3.
climate <- read.table("Data_climate.txt", header = T)
poissons <- read.table("Data_fish.txt", header = T)
traitsNA <- read.table("Data_traits.txt", header = T)
especes <- poissons [,-2]
especes2 <- especes [,-2]
especes3 <- especes2 [,-1]
colnames(especes3) <- traitsNA$Origin
especes44<-cbind(climate$Latitude,especes3)
especeOri <- cbind(climate$Longitude,especes44)
origine <- cbind(climate$ID_cellule,especeOri)
colnames(origine)[1] <- "ID_cellule"
colnames(origine)[2] <- "Longitude"
colnames(origine)[3] <- "Latitude"
colnames(especeOri)[1] <- "Longitude"
colnames(especeOri)[2] <- "Latitude"
rename.columns <- function(df)
{
unique.column.names <- unique(colnames(df))
for (unique.column.name in unique.column.names)
{
idxs.columns <- which(colnames(df) == unique.column.name)
df.tmp.with.new.col.names <- subset(df, select = idxs.columns)
colnames(df)[idxs.columns] <- colnames(df.tmp.with.new.col.names)
}
return(df)
}
especeOri <- rename.columns(especeOri)
espagg <- aggregate(. ~ Longitude + Latitude,
especeOri,
FUN = sum)
# This function aggregates rows if their column names have the same start.
# Suppose we have a data frame with column names A, A.1, and A.2. Then,
# the rows of these columns are aggregated using sum. So,
# A 1 1 0
# A.1 2 1 0
# A.2 0 0 1
# becomes
# A 3 2 1
aggregate.columns <- function(df.my, column.names)
{
for (column.name in column.names) {
df.my[[column.name]] <- df.my[[column.name]] +
rowSums(subset(df.my,
select = grep(paste(column.name, ".[1-9]+",
sep = ""),
colnames(df.my))))
}
return(df.my)
}
aggregate.column.names <- c("A", "U", "L", "E", "W")
espagg <- aggregate.columns(espagg, aggregate.column.names)
espagg <- subset(especeOri, select = c("Longitude", "Latitude", aggregate.column.names))
HTH!
This question already has answers here:
For each row return the column name of the largest value
(10 answers)
Closed 4 years ago.
d <- structure(
list(
Cl = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
SaCl = c(0, 1, 0, 0,0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0),
SiCl = c(0L,0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,0L, 0L, 0L),
ClLo = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
SiClLo = c(0L, 0L, 0L,0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
SaClLo = c(1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1),
SaLo = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
SaSiLo = c(0L, 0L,0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
SiLo = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
LoSa = c(0L, 0L, 0L, 0L,0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
Sa = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,0L, 0L, 0L, 0L, 0L, 0L, 0L)
),
row.names = c(NA, 20L),
class = "data.frame"
)
Each row has only one 1. I want to extract the column name which has 1 for each row such that my dataframe looks like
row.id | names
-------+-------
1 | SaClLo
2 | SaCl
3 | SaClLo
4 | SaClLo
I tried to run a function to each row
apply(d, 1, function(x) colnames(x)[x == 1])
This is giving me NULL.
Use max.col to find the positions of the 1s and use this vector to select the respective column names.
data.frame(row.id = 1:nrow(d),
names = names(d)[max.col(d)])
# row.id names
#1 1 SaClLo
#2 2 SaCl
#3 3 SaClLo
#4 4 SaClLo
#...
For each row, we find which column has a value of 1, then select the value of colnames for that row. Then we convert it to a data.frame
data.frame(names = apply(d, 1, function(x) colnames(d)[which(x == 1)]))
names
1 SaClLo
2 SaCl
3 SaClLo
4 SaClLo
...
Optionally, you can run it through tibble::rowname_to_column() to change the row.id from rownames to a column.
data.frame(names = apply(d, 1, function(x) colnames(d)[which(x == 1)])) %>%
tibble::rownames_to_column()
rowname names
1 1 SaClLo
2 2 SaCl
3 3 SaClLo
4 4 SaClLo
...
A little-known feature of which is your friend:
> which(d==1, arr.ind=TRUE)
row col
2 2 2
11 11 2
15 15 2
13 13 4
...
The second column is the information you need:
> arr_indices <- which(d == 1, arr.ind = TRUE)
> colnames(d)[ arr_indices[, 2] ]
[1] "SaCl" "SaCl" "SaCl" "ClLo" "SaClLo" "SaClLo" "SaClLo" "SaClLo"
[9] "SaClLo" "SaClLo" "SaClLo" "SaClLo" "SaClLo" "SaClLo" "SaClLo" "SaClLo"
[17] "SaClLo" "SaClLo" "SaClLo" "SaClLo"
And you can put this into a data frame or whatever. I like this answer because it is relatively easy-to-read code.
My issue is that when I try to retrieve names(myresults) after subsetting a table I get null when the returned subset has only 1 result. Rather than returning a character vector of row names r returns an integer (in this case of 1).
Here is a table
head(tbl)
1 2 3 4 5 6
afford 0 1 0 0 0 0
app 0 0 0 1 0 0
back 0 1 0 0 0 0
cancel 0 0 0 0 1 0
charg 0 0 0 0 0 1
download 0 0 0 0 0 1
I have been subsetting the table within a loop to return a table for each group. If a term belongs to a group it has a value of 1:
for (i in 1:ncol(tbl)) {
t <- tbl[which(tbl[,i]==1),i]
nam <- names(t)
df <- as.data.frame(nam)
names(df) <- paste0("Cluster ",i)
print(kable(df))
}
This loop seems to work OK when there are more than one instance of a term returned by which(). But the group 4, which has only 1 term "app" gives me issues. Here's an example on group 3, which works as expected then on group 4, which does not:
> t <- tbl[which(tbl[,4]==1),4] # only 1 observation meets this criteria
> t
[1] 1
> t <- tbl[which(tbl[,3]==1),3] # 3 observations meet this criteria
> t
aword cat dog
1 1 1
So I can get names(t) for tbl[,3] where it has 3 returned instances but not for tbl[,4] which only has 1.
> t <- fintab[which(fintab[,4]==1),4]
> names(t)
NULL # expected "app"
> t <- fintab[which(fintab[,4]==1),4]
> names(t)
[1] "aword" "cat" "dog"
How can I get names(t) when I have only 1 instance returned like in the example?
Some further context following comment below:
> str(tbl)
'table' int [1:33, 1:6] 0 0 0 0 0 0 0 0 0 0 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:33] "aword" "app" "cat" "dog" ...
..$ : chr [1:6] "1" "2" "3" "4" ...
>
and
> dput(tbl)
structure(c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L), .Dim = c(33L, 6L), .Dimnames = structure(list(
c("aword", "app", "back", "cancel", "charg", "download",
"enough", "expens", "get", "great", "just", "like", "love",
"cat", "dog", "bla", "month", "much", "need",
"never", "phone", "pleas", "blabla", "realli", "term", "sign",
"thank", "time", "triangle", "use", "want", "will", "work"), c("1",
"2", "3", "4", "5", "6")), .Names = c("", "")), class = "table")
As we are subsetting a single column, we get the logical index (tbl[,4] ==1 - no need to wrap with which unless there are NAs. In that case, the which remove those NAs) and use that to subset the column vector.
tbl[,4][tbl[,4]==1]
# app
# 1
tbl[,3][tbl[,3]==1]
# cat blabla time
# 1 1 1