R find index of a variable and subset a list - r

I have a list that looks like this
#Make dataframes
df1 = data.frame(x = c("a", "b", "c"), y = 1:3, stringsAsFactors = F)
df2 = df1 %>% mutate(y = y*2)
df3 = df1 %>% mutate(y = y*3)
#Make a name for each dataframe
myvar = "fname"
#Combine name and dataframe into a list
mylist = list(myvar, df1)
#Add the other dataframes and name to the list (done in a loop for my bigger dataset)
list2 = list(myvar, df2)
mylist = rbind(mylist, list2)
list3 = list(myvar, df3)
mylist = rbind(mylist, list3)
I want to pull a subset of the list with all the data associated with "c"
x y
3 c 3
x y
3 c 6
x y
3 c 9
This is what I tried but it doesn't work
#Find all instances of "c"
picksite = "c"
site_indices = which(mylist[,2] == picksite)
mylist[site_indices,]
Any suggestions on how to do this, or even a link to better understand lists? Thanks so much.

Wrapping the which inside of lapply will solve this problem:
lapply(mylist[,2], FUN = function(i) i[which(i$x == "c"),])
$mylist
x y
3 c 3
$list2
x y
3 c 6
$list3
x y
3 c 9

Using tidyverse, we can loop over the list with map and use if_any to filter
library(dplyr)
library(purrr)
map(mylist[,2], ~ .x %>%
filter(if_any(everything(), ~ .x == "c")))
-output
$mylist
x y
1 c 3
$list2
x y
1 c 6
$list3
x y
1 c 9

Related

Apply function on multiple lists in R

I have four lists each with multiple data frames.
I need to apply the same function on the lists.
How can I do this?
Sample data:
df1 <- data.frame(x = 1:3, y = letters[1:3])
df2 <- data.frame(x = 4:6, y = letters[4:6])
df3 <- data.frame(x = 7:9, y = letters[7:9])
df4 <- data.frame(x = 10:12, y = letters[10:12])
list1 <- list(df1,df2)
list2 <- list(df3,df4)
In my real data I import based on a pattern in the filename and thus my list elements will have the following names (sample data):
names(list1) <- c("./1. Data/df1.csv", "./1. Data/df2.csv")
names(list2) <- c("./1. Data/df3.csv", "./1. Data/df4.csv")
And this is one of the functions I want to run on all lists.
# Derive clean element names from the file-path names of `x`.
#
# The names of `x` are expected to be file paths such as
# "./1. Data/df1.csv". Only the file name is kept and a trailing ".csv"
# extension is removed.
#
# Arguments:
#   x  A named list (or other named object) whose names are file paths.
#
# Returns:
#   A character vector of cleaned names, one per element of `x`, or NULL
#   when `x` has no names (so `names(x) <- element.name(x)` stays a no-op).
element.name <- function(x) {
  paths <- names(x)
  if (is.null(paths)) {
    return(NULL)
  }
  # Anchor the pattern so that only a trailing ".csv" extension is removed,
  # not a ".csv" that happens to occur in the middle of a file name.
  sub("\\.csv$", "", basename(paths))
}
which will give the desired output
names(list1) <- element.name(list1)
names(list1)
[1] "df1" "df2"
I've tried using a for loop but I end up overwriting my output, so I hope some of you can help me out, since I need to run a lot of functions on my lists.
You could create a list of your lists, and then use lapply to apply the function element.name to every list. You can use setNames to avoid problems linked to the assignment of names. You can then use list2env to get your data.frames back to the global environment.
# Bundle the lists under their own names, replace each list's element names
# with the cleaned names from element.name(), then write the renamed lists
# back over `list1` / `list2`.
# list2env() creates a brand-new environment when `envir` is not supplied,
# so the global environment has to be named explicitly for the originals
# to be updated.
setNames(list(list1, list2), c("list1", "list2")) |>
  lapply(function(x) setNames(x, element.name(x))) |>
  list2env(envir = .GlobalEnv)
output
> list1
$df1
x y
1 1 a
2 2 b
3 3 c
$df2
x y
1 4 d
2 5 e
3 6 f
> list2
$df3
x y
1 7 g
2 8 h
3 9 i
$df4
x y
1 10 j
2 11 k
3 12 l
Here is an approach using data.table::fread
library(data.table)
# create dummy CSVs -------------------------------------------------------
DT1 <- data.frame(x = 1:3, y = letters[1:3])
DT2 <- data.frame(x = 4:6, y = letters[4:6])
DT3 <- data.frame(x = 7:9, y = letters[7:9])
DT4 <- data.frame(x = 10:12, y = letters[10:12])
mapply(write.csv, x = list(DT1, DT2, DT3, DT4), file = list("DT1.csv", "DT2.csv", "DT3.csv", "DT4.csv"), row.names = FALSE)
# read in CSVs ------------------------------------------------------------
# List the CSV files in the working directory. `pattern` is a regular
# expression, so the dot is escaped: an unescaped "." matches any character
# and would also pick up names that merely end in "csv" (e.g. "notcsv").
csv_paths <- list.files(path = ".", pattern = "\\.csv$")
# might need to split this into different steps due to different csv formats?
DT_list <- setNames(lapply(csv_paths, fread), tools::file_path_sans_ext(basename(csv_paths)))
# apply a function to each data.table -------------------------------------
lapply(DT_list, function(DT){DT[, test := x*2]})
If you want to stick with the given dummy data just merge the lists:
list1 <- list(df1,df2)
list2 <- list(df3,df4)
DT_list <- setNames(c(list1, list2), tools::file_path_sans_ext(basename(csv_paths)))

Special reshape in R

Consider a 3x3 char dataframe:
example <- data.frame(one = c("a","b","c"),
two = c("a","b","b"),
three = c ("c","a","b"))
I want to resize these data to 6x2 and add the following content:
desired <- data.frame(one = c("a","a","b","b",
"c","b"),
two = c("a","c","b","a","b","b"))
For the original example dataframe, I want to rbind() the contents of example[,2:3] beneath each row index.
This can be achieved by:
ex <- as.matrix(example)
des <- as.data.frame(rbind(ex[,1:2], ex[,2:3]))
Maybe using library(tidyverse) for an arbitrary number of columns would be nicer?
For each pair of columns, transpose the sub-data.frame defined by them and coerce to vector. Then coerce to data.frame and set the result's names.
The code that follows should be scalable, it does not hard code the number of columns.
desired2 <- as.data.frame(
lapply(seq(names(example))[-1], \(k) c(t(example[(k-1):k])))
)
names(desired2) <- names(example)[-ncol(example)]
identical(desired, desired2)
#[1] TRUE
The code above rewritten as a function.
# Stack every pair of neighbouring columns of `x` into one output column.
#
# For each column position j >= 2, the two-column slice (j - 1, j) is
# transposed and flattened, so the values are read row by row. The result
# has one column per adjacent pair, named after the left column of the pair.
#
# Arguments:
#   x  A data.frame.
#
# Returns:
#   A data.frame with ncol(x) - 1 columns and 2 * nrow(x) rows.
reformat <- function(x) {
  right_cols <- seq_along(names(x))[-1]
  stacked <- lapply(right_cols, function(j) {
    pair <- x[c(j - 1, j)]
    # t() yields a matrix; as.vector() flattens it column by column, which
    # corresponds to reading the original pair row by row.
    as.vector(t(pair))
  })
  out <- as.data.frame(stacked)
  names(out) <- names(x)[-ncol(x)]
  out
}
reformat(example)
example %>% reformat()
Another example, with 6 columns input.
ex1 <- example
ex2 <- example
names(ex2) <- c("fourth", "fifth", "sixth")
ex <- cbind(ex1, ex2)
reformat(ex)
ex %>% reformat()
A tidyverse approach using tidyr::pivot_longer may look like so:
library(dplyr)
library(tidyr)
pivot_longer(example, -one, values_to = "two") %>%
select(-name)
#> # A tibble: 6 × 2
#> one two
#> <chr> <chr>
#> 1 a a
#> 2 a c
#> 3 b b
#> 4 b a
#> 5 c b
#> 6 c b
A base-R solution with Map:
# Iterate over example$one, example$two and example$three in parallel.
# Each step receives one row's values (x, y, z) and builds the two output
# rows for it: (x, y) and (y, z).
# The argument labels are written after the commas; a trailing comment
# that contains the comma would hide it from the parser.
mylist <- Map(
  function(x, y, z) {
    data.frame(one = c(x, y), two = c(y, z))
  },
  example$one,   # x
  example$two,   # y
  example$three  # z
)
# Bind the per-row data frames into a single data frame.
do.call(rbind, mylist)
one two
a.1 a a
a.2 a c
b.1 b b
b.2 b a
c.1 c b
c.2 b b

Adding columns based on a specific function with (l)apply or purr on lists in a list?

I want to add columns based on a function in all lists in list.
list1 <- list(A = c(1:10), B = c(rnorm(1:10)), C = c(rnorm(1:10)), D = c(rnorm(1:10)))
list2 <- list(A = c(1:10), B = c(rnorm(1:10)), C = c(rnorm(1:10)), D = c(rnorm(1:10)))
both_lists <- list(list1,list2)
both_lists <- lapply(both_lists, function(x) ... )
For one dataframe (not in a list) I normally use:
df1 <- data.frame(A = c(1:10), B = c(rnorm(1:10)), C = c(rnorm(1:10)), D = c(rnorm(1:10)))
df2 <- data.frame(A = c(1:10), B = c(rnorm(1:10)), C = c(rnorm(1:10)), D = c(rnorm(1:10)))
df1 %>% mutate(max = do.call(pmax, c(select(., c(2:4)))))
But how do I do this for the lists* in the list? So I want to do 2 things to all the lists in my list:
find the maximum of columns 2-4
add that maximum as a separate row
Oh and could anyone also tell me how I actually change the name of the list inside the list? (So changing the name of list1 to the name of a row name in the set? EG setting the name of list to df1[[1]][1] and repeat that with lapply for every list in the list?
With lapply you can do it as follows:
lapply(both_lists, function(x){x[['max']] <- do.call(pmax, x[2:4]); x})
The output looks like this:
[[1]]
[[1]]$A
[1] 1 2 3 4 5 6 7 8 9 10
[[1]]$B
[1] 1.325128799 0.341702207 0.341139152 -0.630065889 0.799934566 0.427531770
[7] -1.492861023 2.643621022 0.008158055 -0.187956774
[[1]]$C
[1] -0.8535937 -0.1753520 1.1008905 -0.0385363 -1.6739434 0.2179597 -0.1300490 0.4177869
[9] 1.3066992 0.2369493
[[1]]$D
[1] 0.98472409 0.66930725 0.52449977 0.08553770 -1.81759549 -0.07564249 -0.63611958
[8] -1.19293507 -1.61571223 1.29777033
[[1]]$max
[1] 1.3251288 0.6693073 1.1008905 0.0855377 0.7999346 0.4275318 -0.1300490 2.6436210
[9] 1.3066992 1.2977703
[[2]]
...
Assuming your data.frames df1 and df2 as shown in the OP are in a list named dfl:
library(dplyr)
library(magrittr)
dfl <- lapply(dfl, function(x){
x %<>% mutate(max = do.call(pmax, c(select(., c(2:4)))))
})
And if you want to set the names of the list elements as some value from the data.frames within, maybe something like this?
names(dfl) <- lapply(dfl, function(x){
x[2,2]
})
I hope this is what you actually meant because your question was a bit unclear to me. (Apologies if I am wrong.)

How to find if a value exists in a range and print "FOUND" or "MISSING" in a new column

I am trying to perform a function similar to the Excel function found below:
IF(COUNTIF(RANGE, CRITERIA), "FOUND", "MISSING")
I want to print a new column in my dataframe with found or missing. I understand in R that I can use %in% for example:
# The operator is spelled %in% (both percent signs are required); it gives
# one TRUE/FALSE per element of A$C.
A$C %in% C$B
To find if the values in column C of the A dataframe exist in the values in column B of the C dataframe. However, I do not know how to subset said results with a conditional function to print found or missing to a new column in the correct row.
Here is an example of the dataframes:
A <- data.frame("C" = c(3,5,9,21,25), "D" = 1:5)
C <- data.frame("B" = c(3,6,21,22,8) , "F" = 10:14)
A$C %in% C$B
A[A$C %in% C$B,]
Based on the limited information:
lookup_list <- c(1:3)
x <- c('a','b','c')
y <- c(10, 3, 5)
df <- data.frame(x,y)
x y
1 a 10
2 b 3
3 c 5
df <- df %>%
mutate(status = case_when(
y %in% lookup_list ~ 'FOUND',
!y %in% lookup_list ~ 'MISSING'
))
x y status
1 a 10 MISSING
2 b 3 FOUND
3 c 5 MISSING

How to Add Dataframe name to Columns from Multiple Dataframes

The goal is to rename a list of dataframes columns, but while adding the dataframe name to the new column name.
ex: from x to a_x and b_x.
Why? Because I plan to later merge the sets and would like clean ids for the columns.
a = data.frame(x = c(1,2))
b = data.frame(x = c(3,4))
frameList = list(a = a, b = b)
newName = c(*frameName+'_'+'x')
names = lapply(names, setNames, nm = newName)
list2env(names,.GlobalEnv)
Here is one way for you. I looped through each data frame in frameList using the length of frameList. For the column names in each data frame, I took the name of the data frame (i.e., names(frameList)) and pasted it onto the column names in that data frame.
a = data.frame(x = c(1,2), y = 1:2)
b = data.frame(x = c(3,4), y = 1:2)
frameList = list(a = a, b = b)
# Prefix every column name with the name of the data frame it belongs to.
# seq_along() is used instead of 1:length(): for an empty list it gives an
# empty sequence, whereas 1:0 would iterate over c(1, 0).
lapply(seq_along(frameList), function(i) {
  df <- frameList[[i]]
  names(df) <- paste(names(frameList)[i], names(df), sep = "_")
  df
})
[[1]]
a_x a_y
1 1 1
2 2 2
[[2]]
b_x b_y
1 3 1
2 4 2
Or another option is Map
Map(function(x, y) setNames(x, paste(y, names(x), sep="_")), frameList, names(frameList))
#$a
# a_x a_y
#1 1 1
#2 2 2
#$b
# b_x b_y
#1 3 1
#2 4 2
Or with tidyverse
library(purrr)
library(dplyr)
f1 <- function(x, y) paste(y, x, sep="_")
map2(frameList, names(frameList), ~ .x %>%
rename_all(f1, .y))
If we need it in the reverse order, this is more simple
map2(frameList, names(frameList), ~ .x %>%
rename_all(paste, sep="_", .y))

Resources