Related
The general idea of this question has been asked here
However the answer did not work in my specific case since I want to use a third input into the function, which is a large dataframe. I have also tried using sapply as per this post but that still does not work.
My goal is to avoid having to create the new columns by hand/use a for loop to append into the new columns. Is this possible in R? Is there another more 'R' way to structure my data and/or function? I looked at purrr::pmap but I don't know how to have it output multiple columns
Here is my minimal reproducible example:
library(tidyverse)
find_sample_gaps<-function(site, analyte, df){
Sample <- df%>%
filter(site_code == site)%>%filter(analyte_code == analyte)%>%
mutate(Year = as.numeric(format(Date, '%Y')))
x<-Sample%>%
group_by(Year)%>%
summarize(n_samples = length(Year))
gaps<-which(c(1,diff(x$Year))>1)
a<-sum(x$n_samples)
b<-length(unique(Sample$Date))
c<-'No gaps'
if(length(gaps)>0){
c<-paste('There are', as.character(gaps), 'gaps')
}
return(cbind(a,b,c))
}
# use function inside cbind to add columns to dataframe
result<-cbind(output1, find_sample_gaps(output1$site_code, output1$analyte_code, output2)) # throws error because output2 dataframe isn't the same size as result?
# another attempt also using cbind with sapply
result<-cbind(output1, t(sapply(c(output1$site_code, output1$analyte_code, output2), find_sample_gaps))) # also throws error, does not recognize the inputs into the function?
Here is my input data:
output1<-structure(list(site_code = c("a", "b", "c", "d", "e", "f", "g",
"h", "i", "j", "j", "j", "j", "j", "j", "j", "k", "k", "k", "k",
"k", "k", "k", "l", "l", "l", "l", "l", "l", "m", "n", "o", "p",
"q", "r", "s", "t", "u", "v", "w", "w", "w", "w", "w", "x", "x",
"x", "x", "x", "y", "y", "y", "z", "z", "z", "z", "z", "aa",
"aa", "aa", "aa", "aa", "aa", "aa", "bb", "bb", "bb", "bb", "bb",
"cc", "cc", "cc", "cc", "cc", "dd", "dd", "dd", "dd", "dd", "ee",
"ee", "ee", "ee", "ee", "ee", "ee", "ff", "ff", "ff", "ff", "ff",
"gg", "gg", "gg", "gg", "gg", "hh", "hh", "hh", "hh", "hh", "hh",
"ii", "ii", "ii", "ii", "ii", "ii", "jj", "jj", "jj", "jj", "jj",
"jj", "jj"), analyte_code = c("a", "a", "a", "a", "a", "a", "a",
"a", "a", "b", "c", "d", "e", "a", "f", "g", "b", "c", "d", "e",
"a", "f", "g", "c", "d", "e", "a", "f", "g", "a", "a", "a", "a",
"a", "a", "a", "a", "a", "a", "d", "e", "a", "f", "g", "d", "e",
"a", "f", "g", "a", "f", "g", "d", "e", "a", "f", "g", "b", "c",
"d", "e", "a", "f", "g", "d", "e", "a", "f", "g", "d", "e", "a",
"f", "g", "d", "e", "a", "f", "g", "b", "c", "d", "e", "a", "f",
"g", "d", "e", "a", "f", "g", "d", "e", "a", "f", "g", "c", "d",
"e", "a", "f", "g", "c", "d", "e", "a", "f", "g", "b", "c", "d",
"e", "a", "f", "g")), row.names = c(NA, -115L), class = c("tbl_df",
"tbl", "data.frame"))
output2<-structure(list(site_code = c("dd", "k", "k", "r", "aa", "ii",
"y", "l", "l", "l", "q", "cc", "w", "bb", "c", "ff", "m", "ii",
"p", "ff", "ff", "z", "ff", "l", "w", "hh", "ff", "ff", "ff",
"k", "j", "bb", "x", "hh", "jj", "z", "dd", "q", "aa", "k", "bb",
"r", "e", "j", "j", "ii", "y", "hh", "p", "p", "u", "gg", "ff",
"p", "cc", "u", "dd", "n", "bb", "bb", "aa", "ff", "x", "k",
"w", "x", "j", "bb", "cc", "ii", "hh", "jj", "b", "hh", "y",
"u", "cc", "hh", "aa", "b", "jj", "hh", "gg", "y", "r", "a",
"aa", "aa", "z", "ff", "ee", "g", "hh", "hh", "cc", "hh", "hh",
"h", "l", "k"), analyte_code = c("e", "b", "b", "c", "f", "d",
"a", "a", "a", "d", "f", "c", "g", "a", "a", "e", "a", "e", "a",
NA, "c", "a", "d", "c", "d", "b", "a", "f", "a", "g", "b", "c",
"f", "f", "c", "a", "f", "a", "e", "g", "c", "a", "a", "b", "e",
"a", "e", "c", "a", "a", "a", "a", "b", "a", "e", "a", "f", "a",
"a", "a", "c", "e", "a", "e", "a", "c", "e", "c", "a", "e", "c",
"a", "a", "g", "c", "a", "b", "b", "f", "b", "e", "d", "d", "c",
"c", "a", "a", "b", "f", "f", "b", "a", "e", "g", "c", "a", "a",
"a", "e", "d"), Date = structure(c(13326, 14287, 14403, 17669,
16330, 18603, 17428, 15502, 18708, 13780, 17757, 18582, 18087,
18582, 17433, 13326, 17674, 13668, 18059, 17966, 16701, 17142,
14915, 16861, 13999, 15502, 15412, 16856, 14551, 18708, 12128,
14314, 13326, 12563, 13780, 17224, 17611, 15703, 16239, 13780,
12970, 16096, 16544, 17134, 18603, 13780, 18388, 15684, 19157,
18684, 17449, 18857, 15075, 18746, 12683, 15618, 17142, 18634,
15601, 17065, 15926, 12970, 17611, 16692, 13943, 12871, 16958,
13263, 13451, 16179, 13094, 15044, 18131, 12212, 15966, 16410,
14775, 13283, 16239, 16391, 17050, 13283, 16085, 16330, 17362,
18393, 18087, 13724, 14396, 14396, 17331, 19106, 14215, 13388,
14088, 18241, 18143, 17187, 13486, 12482), class = "Date")), row.names = c(NA,
100L), class = "data.frame")
Does this give you what you are after? Note I changed the return of the function to be a list.
find_sample_gaps<-function(site, analyte, df){
Sample <- df%>%
filter(site_code == site)%>%filter(analyte_code == analyte)%>%
mutate(Year = as.numeric(format(Date, '%Y')))
x<-Sample%>%
group_by(Year)%>%
summarize(n_samples = length(Year))
gaps<-which(c(1,diff(x$Year))>1)
a<-sum(x$n_samples)
b<-length(unique(Sample$Date))
c<-'No gaps'
if(length(gaps)>0){
c<-paste('There are', as.character(gaps), 'gaps')
}
comb <- list(a = a, b = b, c = c)
return(comb)
}
output3 <- output1 %>%
mutate(a = find_sample_gaps(site, analyte, all_of(output2))$a,
b = find_sample_gaps(site, analyte, all_of(output2))$b,
c = find_sample_gaps(site, analyte, all_of(output2))$c)
output3
site_code analyte_code a b c
<chr> <chr> <int> <int> <chr>
1 a a 1 1 No gaps
2 b a 1 1 No gaps
3 c a 1 1 No gaps
4 d a 1 1 No gaps
5 e a 1 1 No gaps
6 f a 1 1 No gaps
7 g a 1 1 No gaps
8 h a 1 1 No gaps
9 i a 1 1 No gaps
10 j b 1 1 No gaps
I have two data matrices of different dimensions stored as objects in R (I am using Rstudio with R v4.0.2 in Windows 10):
m1 = 1 column x 44 rows (this is a list of names with no spaces).
m2 = 500,000 columns x 164 rows (this contains a string of characters, the first row being a list of names).
I want to check how many (and which) of the rows of m1 are found in m2 (meaning it will be anywhere between 0 and 44). The end goal is that I have 4000 different matrices that will substitute the place of m2, and I need to see the extent of missing entries (found in m1) in all of the m2s (i.e., I am looking at the extent of missing data of those 44 names).
I am still a beginner to R, so apologies if my description is a bit off.
I tried storing each matrix, saved as CSV files, as such:
m1 <- read.csv("names-file.csv")
m2 <- read.csv("data-file.csv")
and tried to use the row.match function in the prodlim package, and ran row.match(m1, m2) but only got numeric values. I am looking to see just a number of how many of the names from m1 (first column) are found in m2 (first column), which values those are, and what the percentage would be (x out of 44).
As an example:
m1 =
Tom
Harry
Cindy
Megan
Jack
`
m2 =
Tom XXXXXXXXXXXX----XXXXXXXX
Stephanie XXXXXXXXXXXXXXXX----XXXX
Megan XXXXXXXXXXXXXXXXXXXXXXXX
Ryan XXXXXXXXXXXXXXXXXXXXXX-X
David XXXXXX---XXXXXXXXXXXXXXX
Josh XXXXXXXXXXXXXXXXXXXXXXXX
In the m2 matrix, each name is column 1, and the each subsequent X (which represents either an A, T, C, or G) are the subsequent columns (so some columns have an A, T, C, or G, or a "-"). I am looking to write a code that would see how many of the names from m1 and found in m2 (and conversely, how much data is missing from m2 as a percentage). In this case, the desired outputs would be:
2
Tom
Megan
60%
Here are my specific datafile using dput() (please let me know if I am using dput() correctly):
m1:
structure(list(V1 = c("Taxon1", "Taxon2", "Taxon3", "Taxon4",
"Taxon5", "Taxon6", "Taxon7", "Taxon8")), class = "data.frame", row.names = c(NA,
-8L))
m2:
structure(list(V1 = c("Taxon1", "Taxon3", "Taxon4", "Taxon6",
"Taxon7", "Taxon9", "Taxon10", "Taxon11", "Taxon12", "Taxon13",
"Taxon14", "Taxon15", "Taxon16", "Taxon17", "Taxon18", "Taxon19",
"Taxon20", "Taxon21", "Taxon22", "Taxon23", "Taxon24", "Taxon25",
"Taxon26", "Taxon27", "Taxon28", "Taxon29", "Taxon30"), V2 = c("A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "C", "C", "C", "C", "C", "C", "C"
), V3 = c("G", "G", "G", "G", "G", "C", "C", "G", "G", "G", "G",
"G", "G", "G", "G", "G", "G", "G", "G", "G", "G", "G", "G", "G",
"G", "G", "G"), V4 = c("C", "C", "C", "C", "C", "T", "G", "C",
"C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C",
"C", "C", "C", "C", "C", "C"), V5 = c("T", "T", "G", "T", "G",
"G", "G", "T", "T", "T", "T", "T", "T", "T", "T", "T", "T", "T",
"T", "T", "T", "T", "T", "T", "T", "T", "T"), V6 = c("G", "G",
"C", "G", "C", "C", "C", "G", "G", "G", "G", "G", "G", "G", "G",
"G", "G", "G", "G", "G", "G", "G", "G", "G", "G", "G", "G"),
V7 = c("C", "C", "A", "C", "A", "A", "A", "C", "C", "C",
"C", "C", "C", "C", "C", "C", "G", "G", "G", "G", "G", "G",
"G", "G", "G", "G", "G"), V8 = c("T", "T", "A", "T", "A",
"A", "A", "T", "T", "T", "T", "T", "T", "T", "T", "T", "T",
"T", "T", "T", "T", "T", "T", "T", "T", "T", "T"), V9 = c("A",
"A", "A", "A", "A", "T", "T", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "T", "T", "T", "T", "T", "T", "T", "T",
"T", "T")), class = "data.frame", row.names = c(NA, -27L))
Thank you!
You might want to have a look at the %in% operator in R. According to your question, you might want something like this:
m1[,1] %in% m2[,1]
#[1] TRUE FALSE TRUE TRUE FALSE TRUE TRUE FALSE
You can then pair it with functions such as mean or sum which will help you to find the percentage as required:
sum(m1[,1] %in% m2[,1])
#[1] 5
mean(m1[,1] %in% m2[,1])
#[1] 0.625
EDIT: As required by the OP in the comments of this post, there are various methods for that, my personal favourite being the which function:
m1[which(m1[,1] %in% m2[,1]),]
#[1] "Taxon1" "Taxon3" "Taxon4" "Taxon6" "Taxon7"
m1[which(!(m1[,1] %in% m2[,1])),]
#[1] "Taxon2" "Taxon5" "Taxon8"
Again, this is only one method, out of many (I can count 3 right now...), so I suggest you to explore the other options...
To get common names in both the dataframes you may use intersect, to calculate missing percentage you can use %in% with mean
common_names <- intersect(m1$V1, m2$V1)
missing_percentage_in_m1 <- mean(!m1$V1 %in% m2$V1) * 100
missing_percentage_in_m2 <- mean(!m2$V1 %in% m1$V1) * 100
common_names
#[1] "Taxon1" "Taxon3" "Taxon4" "Taxon6" "Taxon7"
missing_percentage_in_m1
#[1] 37.5
missing_percentage_in_m2
#[1] 81.48148
This code will get result like this
2
Tom
Megan
60%
1.how many of the names from m1 and found in m2
m1 <- t(m1)
res1 <-m2 %>%
rowwise %>%
mutate(n = m1 %in% c_across(V1:V9) %>% sum)
res1
# A tibble: 27 x 10
# Rowwise:
V1 V2 V3 V4 V5 V6 V7 V8 V9 n
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <int>
1 Taxon1 A G C T G C T A 1
2 Taxon3 A G C T G C T A 1
3 Taxon4 A G C G C A A A 1
4 Taxon6 A G C T G C T A 1
5 Taxon7 A G C G C A A A 1
6 Taxon9 A C T G C A A T 0
7 Taxon10 A C G G C A A T 0
8 Taxon11 A G C T G C T A 0
9 Taxon12 A G C T G C T A 0
10 Taxon13 A G C T G C T A 0
# ... with 17 more rows
res1 %>% select(n) %>% sum
[1] 5
res2 <-res1 %>%
filter(n >0) %>%
pull(V1) %>%
unique
res2
[1] "Taxon1" "Taxon3" "Taxon4" "Taxon6" "Taxon7"
2.how much data is missing from m2 as a percentage
res3 <- res2 %>% length
1 - res3 / length(unique(m2$V1))
[1] 0.8148148
I have some data in JSON format, that using jsonlite I was able to read into a data frame in R. The data I'm working with is in lists, where each list contains character vectors of different lengths. For example:
values
<list>
1 A
2 B
3 character(0)
4 C
5 c(A, C)
6 D
7 c(B, C)
8 c(D, E)
Or, to reproduce in full:
structure(list(values1 = list("C", "E", character(0), "C", character(0),
"C", c("D", "A"), c("D", "A"), "D", "D", character(0), "D",
"A", "E", "E", "A", "A", "A", "B", "A", "A", "A", "A", "D",
"E", "E", "A", character(0), "E", character(0), character(0),
"B", character(0), "C", "C", "C", "C", "C", character(0),
character(0), character(0), character(0), character(0), character(0),
character(0), character(0), "E", c("E", "D"), c("E", "D"),
"B", "E", "E", "A", "A", "B", "B", "B", "B", "B", "D", "D",
character(0), character(0), character(0), character(0), "B",
c("C", "A"), character(0), "A", "B", "B", "B", "B", "B",
"C", "C", character(0), character(0), character(0), character(0),
"E", "E", character(0), character(0), "B", "E", "A", "C",
"B", "C", "A", "C", "C", "C", "C", "C", "A", character(0),
"A", character(0), "A", "D", "B", "A", "C", "A", "A", "A",
"C", "A", "A", "B", "D", "D", character(0), character(0),
character(0), character(0), character(0), character(0), "C",
"B", character(0), "B", character(0), "B", "E", "D", c("C",
"E"), c("C", "E"), "D", "D", "C", "C", character(0), "C",
character(0), "C", "C", "D", "E", "E", "B", "B", "C", "C",
"B", "B", "E", character(0), character(0), character(0),
character(0), "B", "B", "E", "A", character(0), "B", "A",
character(0), "A", "D", "D", c("D", "A"), c("D", "A"), c("D",
"B"), c("D", "B"), character(0), "E", character(0), "E",
"E", "E", "E", character(0), "D", character(0), "E", "A",
"A", "A", "A", "A", "D", "D", c("B", "A"), c("B", "A"), "C",
character(0), character(0), "B", "E", "E", "B", c("E", "B"
), "A", "A", "B", "B", "D", "D", "A", "A", character(0),
"A", "C", character(0), "C", "C", "B", "B", "A", "A", "B",
"B", "A", "E", "C", "C", "D", "D", "D", c("C", "E"), character(0),
character(0), character(0), character(0), "E", c("E", "A"
), "E", character(0), character(0), "A", "D", "D", c("D",
"A"), c("D", "A"), character(0), character(0), character(0),
character(0), character(0), character(0), "B", "C", "C",
"C", "C", "B", "B", c("C", "E"), c("C", "E"), "E", "C", "C",
"C", c("E", "D", "B", "A"), c("E", "D", "B", "A"), character(0),
"A", character(0), "A", c("C", "A"), c("C", "A"), c("C",
"A"), "E", "E", "A", character(0), "C", c("E", "D"), c("E",
"D"), character(0), character(0), character(0), character(0),
"A", "A", "A", "A", "D", "E", c("C", "D"), "E", character(0),
character(0), character(0), "D", "D", character(0), "A",
"B", character(0), character(0), character(0), character(0),
"D", "D", "D", "E", "E", "D", "D", "B", "B", "B", "E", "D",
"C", "D", "C", "C", "E", "E", "A", character(0), character(0),
"B", character(0), "B", "B", "B", "B", character(0), "A",
"C", "C", "C", "D", "D", "D", character(0), "D", character(0),
"D", "B", "A", character(0), "B", "D", "A", "A", character(0),
"A", "D", "D", "E", "E", "B", character(0), character(0),
character(0), "C", "C", "C", "B", "B", "A", "D", c("C", "B"
), character(0), "D", "C", "C", character(0), character(0),
"D", "D", "D", c("B", "A"), "E", "A", "A", character(0),
"E", "C", "B", character(0), character(0), character(0),
character(0), "E", "E", "D", "C", "C", "E", "E", "E", "E",
character(0), "E", "E", "A", "B", "A", "A", "D", "E", "E",
"B", "B", character(0), character(0), "D", "D", "C", "D",
"D", "E", character(0), "E", character(0), "E", c("D", "B"
), character(0), "B", character(0), character(0), "D", character(0),
"D", "D", "D", "C", character(0), "E", "E", c("E", "B"),
c("E", "B"), "E", "E", "D", "D", "B", c("E", "A"), c("E",
"A"), c("C", "D"), c("C", "D"), c("C", "B"), c("C", "B"),
character(0), "C", "B"), values2 = list("C", "E", "C",
"C", "C", "C", c("D", "A"), c("D", "A"), "D", "D", "D", "D",
"A", "E", "E", "A", "A", "A", "B", "A", "A", "A", "A", "D",
"E", "E", "A", "E", "E", character(0), "B", "B", "C", "C",
"C", "C", "C", "C", c("E", "A"), c("E", "A"), c("E", "A"),
c("E", "A"), c("C", "A"), c("C", "A"), c("C", "A"), c("C",
"A"), "E", c("E", "D"), c("E", "D"), "B", "E", "E", "A",
"A", "B", "B", "B", "B", "B", "D", "D", c("C", "B"), c("C",
"B"), c("C", "B"), c("C", "B"), "B", c("C", "A"), character(0),
"A", "B", "B", "B", "B", "B", "C", "C", c("E", "D"), c("E",
"D"), c("E", "D"), c("E", "D"), "E", "E", character(0), character(0),
"B", "E", "A", "C", "B", "C", "A", "C", "C", "C", "C", "C",
"A", "A", "A", "A", "A", "D", "B", "A", "C", "A", "A", "A",
"C", "A", "A", "B", "D", "D", "E", "E", "E", "E", character(0),
character(0), "C", "B", "B", "B", "B", "B", "E", "D", c("C",
"E"), c("C", "E"), "D", "D", "C", "C", "C", "C", "C", "C",
"C", "D", "E", "E", "B", "B", "C", "C", "B", "B", "E", "B",
"B", "B", "B", "B", "B", "E", "A", "B", "B", "A", "A", "A",
"D", "D", c("D", "A"), c("D", "A"), c("D", "B"), c("D", "B"
), "E", "E", "E", "E", "E", "E", "E", "D", "D", "E", "E",
"A", "A", "A", "A", "A", "D", "D", c("B", "A"), c("B", "A"
), "C", character(0), character(0), "B", "E", "E", "B", c("E",
"B"), "A", "A", "B", "B", "D", "D", "A", "A", "A", "A", "C",
"C", "C", "C", "B", "B", "A", "A", "B", "B", "A", "E", "C",
"C", "D", "D", "D", c("C", "E"), "D", "D", "D", "D", "E",
c("E", "A"), "E", character(0), character(0), "A", "D", "D",
c("D", "A"), c("D", "A"), c("D", "A"), c("D", "A"), c("D",
"A"), c("D", "A"), c("D", "A"), c("D", "A"), "B", "C", "C",
"C", "C", "B", "B", c("C", "E"), c("C", "E"), "E", "C", "C",
"C", c("E", "D", "B", "A"), c("E", "D", "B", "A"), "A", "A",
"A", "A", c("C", "A"), c("C", "A"), c("C", "A"), "E", "E",
"A", "C", "C", c("E", "D"), c("E", "D"), "A", "A", "A", "A",
"A", "A", "A", "A", "D", "E", c("C", "D"), "E", character(0),
character(0), character(0), "D", "D", character(0), "A",
"B", c("D", "B"), c("D", "B"), c("D", "B"), c("D", "B"),
"D", "D", "D", "E", "E", "D", "D", "B", "B", "B", "E", "D",
"C", "D", "C", "C", "E", "E", "A", character(0), "B", "B",
"B", "B", "B", "B", "B", "A", "A", "C", "C", "C", "D", "D",
"D", "D", "D", "D", "D", "B", "A", "B", "B", "D", "A", "A",
"A", "A", "D", "D", "E", "E", "B", character(0), character(0),
character(0), "C", "C", "C", "B", "B", "A", "D", c("C", "B"
), "D", "D", "C", "C", character(0), "D", "D", "D", "D",
c("B", "A"), "E", "A", "A", character(0), "E", "C", "B",
"C", "C", "C", "C", "E", "E", "D", "C", "C", "E", "E", "E",
"E", "E", "E", "E", "A", "B", c("C", "E", "D", "B", "A"),
c("C", "E", "D", "B", "A"), "D", "E", "E", "B", "B", character(0),
character(0), "D", "D", "C", "D", "D", "E", "E", "E", "E",
"E", c("D", "B"), "B", "B", character(0), "D", "D", "D",
"D", "D", "D", "C", "E", "E", "E", c("E", "B"), c("E", "B"
), "E", "E", "D", "D", "B", c("E", "A"), c("E", "A"), c("C",
"D"), c("C", "D"), c("C", "B"), c("C", "B"), "C", "C", "B")), row.names = c(NA,
445L), class = "data.frame")
I would like to split this data up so that each value gets its own column:
1 2 3 4 5
<chr> <chr> <chr> <chr> <chr>
1 A
2 B
3
4 C
5 A C
6 D
7 B C
8 D E
Then, ultimately, get the data into a tidy format so that it's easy to filter by a column:
A B C D E
<logi> <logi> <logi> <logi> <logi>
1 TRUE FALSE FALSE FALSE FALSE
2 FALSE TRUE FALSE FALSE FALSE
3 FALSE FALSE FALSE FALSE FALSE
4 FALSE FALSE TRUE FALSE FALSE
5 TRUE FALSE TRUE FALSE FALSE
6 FALSE FALSE FALSE TRUE FALSE
7 FALSE TRUE TRUE FALSE FALSE
8 FALSE FALSE FALSE TRUE TRUE
That last step should be simple with mutate, it's the splitting I can't figure out. I'm aware of both tidyr separate and unnest_wider, but as far as I can tell those don't let me control which columns the vector is split into.
Assuming your data is something like this :
df <- structure(list(values = list("A", "B", character(0), "C", c("A",
"C"), "D", c("B", "C"), c("D", "E"))),
row.names = c(NA, -8L), class = "data.frame")
You can do :
library(dplyr)
library(tidyr)
df %>%
mutate(row = row_number()) %>%
unnest(values) %>%
complete(row = 1:max(row)) %>%
mutate(val = TRUE) %>%
pivot_wider(names_from = values, values_from = val, values_fill = FALSE) %>%
dplyr::select(-`NA`, -row)
# A B C D E
# <lgl> <lgl> <lgl> <lgl> <lgl>
#1 TRUE FALSE FALSE FALSE FALSE
#2 FALSE TRUE FALSE FALSE FALSE
#3 FALSE FALSE FALSE FALSE FALSE
#4 FALSE FALSE TRUE FALSE FALSE
#5 TRUE FALSE TRUE FALSE FALSE
#6 FALSE FALSE FALSE TRUE FALSE
#7 FALSE TRUE TRUE FALSE FALSE
#8 FALSE FALSE FALSE TRUE TRUE
Based on the dput, data, we can do
library(dplyr)
library(tidyr)
df1 %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn) %>%
unnest(value) %>%
pivot_wider(names_from = value, values_from = name,
values_fill = FALSE, values_fn = list(name = ~ length(.) > 0)) %>%
select(-rn)
# A tibble: 422 x 5
# C E D A B
# <lgl> <lgl> <lgl> <lgl> <lgl>
# 1 TRUE FALSE FALSE FALSE FALSE
# 2 FALSE TRUE FALSE FALSE FALSE
# 3 TRUE FALSE FALSE FALSE FALSE
# 4 TRUE FALSE FALSE FALSE FALSE
# 5 TRUE FALSE FALSE FALSE FALSE
# 6 TRUE FALSE FALSE FALSE FALSE
# 7 FALSE FALSE TRUE TRUE FALSE
# 8 FALSE FALSE TRUE TRUE FALSE
# 9 FALSE FALSE TRUE FALSE FALSE
#10 FALSE FALSE TRUE FALSE FALSE
# … with 412 more rows
I have a dataset where I'd like to run classIntervals(df$vol, 3, style="jenks") for every group and type combination within it.
The data looks somewhat like this.
data_sam <- data.frame( "group"=c( "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A"
), "type"=c( "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B" ), "index"=c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
), "vol"=c(52,272,374,408,498,480,451,644,715,659,820,713,810,676,840,589,594,998,782,483,351,494,377,261,637,379,706,530,619,724,333,189,246,82,39,85,159,143,125,118,79,39,110,190,264,101,70,46,0,27,71,69,172,464,132,0,156,167,142,45,51,10,0,14,67,20,2,12,1,0,6,2,2,17,22,7,0,2,9,5,12,15,7,0,12,18,4,3,12,9,12,13,14,8,9,11,10,5,4,1,4,10,4,4,3,5,5,1,3,0,2,3,2,4,2,3,3,0,0,1,1,1,0,0,1,1,2,0,1,1,0,1,1,0,0,1,0,0,0,0,1,2,0,1,1
))
I would like to be able to see the bin results per group-type.
As per the data above, the following results are what should I get when I run classIntervals:
group A - type A
style: jenks
one of 2,628 possible partitions of this variable into 3 classes
[0,190] (190,530] (530,998]
53 17 16
group A - type B
style: jenks
one of 66 possible partitions of this variable into 3 classes
[0,2] (2,5] (5,14]
34 15 10
Is there a way that I can loop through the group types within data_sam for the bins? And, ideally view the results into a data.frame formatted in the following way.
group type count1 count2 count3 boundary1 boundary2 boundary3
A A 53 17 16 [0,190] (190,530] (530,998]
A B 34 15 10 [0,2] (2,5] (5,14]
Alternatively, I'm happy to see even the breaks within each group attached to every row on the data_sam.
I'm not sure what's possible here so please let me know.
Consider by, the object-oriented wrapper to tapply to run operations on subsets of factor(s). Specifically, you can have return a list of data frames to be binded together at end.
Below extracts the brks object of return value from the classIntervals call as docs mention, assumed to be a named vector where names are boundaries and values are counts.
df_list <- by(df, df[,c("group", "type")], function(sub) {
tryCatch({
res <- classIntervals(sub$vol, n=3, style="jenks")$brks
data.frame(group = sub$group[1],
type = sub$type[1],
count1 = res[1],
count2 = res[2],
count3 = res[3],
boundary1 = names(res)[1],
boundary2 = names(res)[2],
boundary3 = names(res)[3])
}, error = function(e) NA
)
})
final_df <- do.call(rbind, df_list)
This question already has answers here:
How to flatten a list to a list without coercion?
(7 answers)
Closed 7 years ago.
I have a very messy list with multiple levels in the form of:
[[1]]
[[1]][[1]]
[[1]][[1]][[1]]
[1] "D" "B" "A"
[[1]][[1]][[2]]
[1] "E" "B" "A"
[[1]][[2]]
[[1]][[2]][[1]]
[1] "D" "C" "A"
[[1]][[3]]
[[1]][[3]][[1]]
[1] "B" "D" "A"
....
[[5]][[2]][[2]]
[1] "D" "B" "E"
[[5]][[3]]
[1] "C" "E"
...
What is the easiest way to just get a list of the lowest level character vectors, so the first element would be "D""B""A" then the next would be "E""B""A" and so forth?
Thanks!
Edit:
Here's my list in dput format as requested. However, the nesting structure can change and the number of levels can increase. Thus any solution that works by using a presupposed number of levels is no good.
> dput(myResults)
list(list(list(c("D", "B", "A"), c("E", "B", "A")), list(c("D",
"C", "A")), list(c("B", "D", "A"), c("C", "D", "A"), c("E", "D",
"A")), list(c("B", "E", "A"), c("D", "E", "A"))), list(list(c("D",
"A", "B"), c("E", "A", "B")), c("C", "B"), list(c("A", "D", "B"
), c("E", "D", "B")), list(c("A", "E", "B"), c("D", "E", "B"))),
list(list(c("D", "A", "C")), c("B", "C"), list(c("A", "D",
"C")), c("E", "C")), list(list(c("B", "A", "D"), c("C", "A",
"D"), c("E", "A", "D")), list(c("A", "B", "D"), c("E", "B",
"D")), list(c("A", "C", "D")), list(c("A", "E", "D"), c("B",
"E", "D"))), list(list(c("B", "A", "E"), c("D", "A", "E")),
list(c("A", "B", "E"), c("D", "B", "E")), c("C", "E"),
list(c("A", "D", "E"), c("B", "D", "E"))))
Edit
There is a package rlist with a function list.flatten that does this
library(rlist)
list.flatten(yourLst)
A recursive solution (the order is changed though, ie. the leastly nested stuff comes out first)
unlst <- function(lst){
if (!any((inds <- sapply(lst, is.list)))) return(lst)
c(lst[!inds], unlst(unlist(lst[inds], rec=F)))
}
Try this function please.
unlist_messy_list <- function(cur_list){
if (is.atomic(cur_list)){
list(cur_list)
}else{
cl <- lapply(cur_list, unlist_messy_list)
Reduce(c, cl)
}
}
As you have not provided a sample data , I tested it with some cases made up by myself and it works.
unlist_messy_list(list())
unlist_messy_list(list(c(1,2,3), c(4,5,6), c(7,8,9)))
unlist_messy_list(list(c(1,2,3), list(c(4,5,6), c(7,8,9))))
unlist_messy_list(list(c(1,2,3), c(4,5,6), list(c(7,8,9), c(10,11,12))))
unlist_messy_list(list(c(1,2,3), list(c(4,5,6), c(7,8,9), list(10, c(11,12,13), 14, list(c(15,16))))))
I just tested it on your newly provided data, and it works fine. The output is (after dput):
list(c("D", "B", "A"), c("E", "B", "A"), c("D", "C", "A"), c("B", "D", "A"), c("C", "D", "A"), c("E", "D", "A"), c("B", "E", "A"), c("D", "E", "A"), c("D", "A", "B"), c("E", "A", "B"), c("C", "B"), c("A", "D", "B"), c("E", "D", "B"), c("A", "E", "B"), c("D", "E", "B"), c("D", "A", "C"), c("B", "C"), c("A", "D", "C"), c("E", "C"), c("B", "A", "D"), c("C", "A", "D"), c("E", "A", "D"), c("A", "B", "D"), c("E", "B", "D"), c("A", "C", "D"), c("A", "E", "D"), c("B", "E", "D"),c("B", "A", "E"), c("D", "A", "E"), c("A", "B", "E"), c("D", "B", "E"), c("C", "E"), c("A", "D", "E"), c("B", "D", "E"))