I find purrr SUPER useful for making results tables for a bunch of different variables. I was wondering if there is a way for the unnest() function (or something else) to expand a higher-order variable into blanks, rather than just repeating it.
For example, with this code:
library(tidyverse)
data <- data.frame(
group1 = sample(c('dog','cat', 'gecko'), 100, replace = T),
group2 = sample(c('hot dog', 'not hotdog', 'other'), 100, replace = T)
)
my_freq <- function(var){
result <- as.data.frame(table(data[[var]]))
colnames(result) <- c('level', 'n')
return(result)
}
the_table <- data.frame(var = c('group1', 'group2'))
the_table <- the_table %>%
mutate(
result = map(var, my_freq)
) %>%
unnest(result)
Instead of the resulting table repeating the value of var on every row of its group, I would like var to appear only on the first row of each group, with blanks on the repeated rows.
I guess this would be a multi-level index in Python (pandas), but I am not sure how to accomplish it in R.
Extending rmagno's solution to other higher-order variables:
...%>%
mutate_at(
.vars = vars(high_order_vars),
.funs = function(x) ifelse(duplicated(.[['var']]), NA, x)
)
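With current dplyr (1.0 and later), mutate_at() is superseded by across(); the same idea could be written as below. This is only a minimal sketch, assuming high_order_vars is a character vector naming the columns to blank:
... %>%
  mutate(across(all_of(high_order_vars), ~ ifelse(duplicated(var), NA, .x)))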
Not sure what you mean by blank (in this example I am going with NA). The critical line is:
mutate(var = if_else(!duplicated(var), var, NA_integer_))
Minimal working example:
library(tidyverse)
data <- data.frame(
group1 = sample(c('dog','cat', 'gecko'), 100, replace = T),
group2 = sample(c('hot dog', 'not hotdog', 'other'), 100, replace = T)
)
my_freq <- function(var){
result <- as.data.frame(table(data[[var]]))
colnames(result) <- c('level', 'n')
return(result)
}
the_table <- data.frame(var = c('group1', 'group2'))
the_table <- the_table %>%
mutate(
result = map(var, my_freq)
) %>%
unnest(result) %>%
mutate(var = if_else(!duplicated(var), var, NA_integer_))
the_table
#> # A tibble: 6 x 3
#> var level n
#> <fct> <fct> <int>
#> 1 group1 cat 38
#> 2 <NA> dog 38
#> 3 <NA> gecko 24
#> 4 group2 hot dog 36
#> 5 <NA> not hotdog 34
#> 6 <NA> other 30
Created on 2020-02-29 by the reprex package (v0.3.0)
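A small caveat: in the reprex above, var comes through as a factor (data.frame() defaulted to factors before R 4.0), which appears to be why the NA_integer_ replacement is accepted there (a factor is stored as integer codes). If var is a character column, the default since R 4.0, the type-matched NA would be needed instead; the adjusted line would be:
mutate(var = if_else(!duplicated(var), var, NA_character_))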
You could just use lapply to get a list of the two frequency tables, which I call tb. Then create a matrix with names(tb) in the first row and blanks in the rest, and convert it to a data frame. Finally, Map applies cbind to pair each column of that data frame (named var) with the corresponding table in tb, and do.call(rbind, ...) stacks the results.
tb <- lapply(data, function(x) setNames(as.data.frame(table(x)), c("level", "n")))
res <- do.call(rbind,
               Map(cbind,
                   var = data.frame(
                     matrix(c(names(tb), rep("", (el(lapply(tb, nrow)) - 1)*2)),
                            ncol = 2, byrow = TRUE)),
                   tb))
res
# var level n
# X1.1 group1 cat 31
# X1.2 dog 26
# X1.3 gecko 43
# X2.1 group2 hot dog 35
# X2.2 not hotdog 37
# X2.3 other 28
I have 2 dataframes like the following:
df1
colA
A
B
C
D
df2
one two
x A
y A;B
z A;D;C
p E
q F
I want to filter df2 for entries contained in df1. i.e "two" containing values of colA, so that my output will be
one two
x A
y A;B
z A;D;C
I tried all of these options, but none of them worked:
df2filtered = df2 %>% filter(two %in% df1$colA)
df2filtered = df2 %>% filter(two %in% str_detect(df1$colA))
df2filtered = df2 %>% select(two, contains(df1$colA))
str_detect works with a plain character string, but not when given a data frame column like the above. What is the right solution?
Here's one way of obtaining the desired output, using map to create an extra column and then filtering on it.
library(tidyverse)
df2 %>%
# Use map to check if any string in df1$colA is found in
# df2$two; then use any to check if any entry is T
mutate(stay = map(two, function(x){
any(str_detect(x,df1$colA))
})) %>%
# Filter
filter(stay == T) %>%
# Remove extra column
select(-c(stay))
# one two
#1 x A
#2 y A;B
#3 z A;D;C
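A slightly more compact variant of the same logic (just a sketch): map_lgl() returns a plain logical vector, so the helper column and the final select() are not needed:
df2 %>%
  filter(map_lgl(two, ~ any(str_detect(.x, df1$colA))))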
Your data is not "tidy". I'd reshape it into a long format. Then, filtering becomes easy.
Below is an approach which makes use of a non-exported function from the eye package in order to split the column into an unknown number of columns (disclaimer: I am the author of this package; the function was inspired by and modified from this answer). Then pivot the result longer and filter by presence in df1$colA. I'd leave the result in a tidy format, but you can of course reshape it back to your rather messy wide format.
library(tidyverse)
df1 <- read.table(text = "colA
A
B
C
D", header = TRUE)
df2 <- read.table(text = "one two
x A
y A;B
z A;D;C
p E
q F ", header = TRUE)
#install.packages("eye")
eye:::split_mult(df2, "two", pattern = ";" ) %>%
pivot_longer(cols = starts_with("var"), names_to = "var", values_to = "val") %>%
drop_na(val)%>%
select(-var) %>%
group_by(one) %>%
filter(any(val %in% df1$colA))
#> # A tibble: 6 x 2
#> # Groups: one [3]
#> one val
#> <chr> <chr>
#> 1 x A
#> 2 y A
#> 3 y B
#> 4 z A
#> 5 z D
#> 6 z C
Created on 2021-07-14 by the reprex package (v2.0.0)
Because this function might change in the future, here it is for reference:
split_mult <- function (x, col, pattern = "_", into = NULL, prefix = "var",
sep = "")
{
cols <- stringr::str_split_fixed(x[[col]], pattern, n = Inf)
cols[which(cols == "")] <- NA_character_
m <- dim(cols)[2]
if (length(into) == m) {
colnames(cols) <- into
}
else {
colnames(cols) <- paste(prefix, 1:m, sep = sep)
}
cbind(cols, x[names(x) != col])
}
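If you would rather not rely on a non-exported helper, tidyr::separate_rows() performs a similar split-into-rows step directly; a sketch that should give the same kind of tidy result with the df1 and df2 defined above:
df2 %>%
  separate_rows(two, sep = ";") %>%
  group_by(one) %>%
  filter(any(two %in% df1$colA))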
Another option is str_detect. You can collapse df1$colA so that str_detect searches for A or B or C or D, i.e. the pattern "A|B|C|D".
library(tidyverse)
df2 %>% filter(str_detect(two, paste(df1$colA, collapse = '|')))
#> one two
#> 1 x A
#> 2 y A;B
#> 3 z A;D;C
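One general caveat (an assumption about other data, not an issue with the values in this example): if df1$colA could contain regex metacharacters, quote each value before collapsing, for instance with \Q...\E:
df2 %>% filter(str_detect(two, paste0('\\Q', df1$colA, '\\E', collapse = '|')))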
I have data that were collected over a year but are broken up by month. In my code, I labeled the data frames df1-df12 for the corresponding months. I am trying to use the group_by function to group all of the data frames in the same way. When I run the following code on a single data frame, it works fine:
df <- df %>%
group_by(date,id) %>%
slice(n()) %>%
ungroup()
However, I would like to streamline this code so that I can use this function for all 12 dataframes without having to copy/paste 12 times, since there is a lot of data to go through. Here is what I have tried to do to that end:
func1<-function(df)
{
df <- df %>%
group_by(date,id) %>%
slice(n()) %>%
ungroup()
}
yr19<-c(df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12)
map(yr19, func1)
However, I get the following error message:
Error in UseMethod("group_by") : no applicable method for 'group_by' applied to an object of class "character"
As stated above, I don't get this error message if I go through and do it individually, but there are many months and many years to be analyzed, and from a time perspective I don't think doing this manually is feasible. Thanks for your help.
There are two ways you can approach this. First, using the approach suggested by @ktiu:
## Create example data
library(dplyr) # for pipe and group_by()
set.seed(914)
df1 <- tibble(
date = sample(1:30, 50, replace = T),
id = sample(1:10, 50, replace = T),
var1 = rnorm(50, mean = 10, sd = 3)
)
df2 <- tibble(
date = sample(1:30, 50, replace = T),
id = sample(1:10, 50, replace = T),
var1 = rnorm(50, mean = 10, sd = 3)
)
Modifying your function to address the error:
func1<-function(df)
{
df <- df %>%
group_by(date,id) %>%
slice(n()) %>%
ungroup()
df
}
## And using list rather than c to combine data frames.
yr19 <- list(df1, df2)
yr19_data <- lapply(yr19, func1)
# This will return a list of data frames you can access with `yr19_data[[1]]`
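For reference, this is also why the original call errored: c() applied to data frames flattens them into one long list of their individual columns, so lapply()/map() end up iterating over plain vectors instead of data frames. A quick check with the two example frames above:
length(c(df1, df2))    # 6 -- the six individual columns
length(list(df1, df2)) # 2 -- the two data frames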
An alternative approach is to add a variable identifying the source data frame, then collapse everything into a single data frame and manipulate it from there. Which approach makes more sense will depend on what else you want to do later.
func2 <- function(df.name){
mutate(get(df.name), source = df.name)
}
# This is set up to get objects given their names, so we'll use a character vector
# of names to iterate off of.
yr19 = c("df1", "df2")
df.list <- lapply(yr19, func2)
df.long <- do.call(bind_rows, df.list)
df.long
# # A tibble: 100 x 4
# date id var1 source
# <int> <int> <dbl> <chr>
# 1 27 9 9.31 df1
# 2 5 3 16.5 df1
# 3 28 3 2.67 df1
# 4 24 4 8.94 df1
# 5 13 3 1.68 df1
At this point you can manipulate one data frame in your original pipe:
df <- df.long %>%
group_by(source, date,id) %>%
slice(n()) %>%
ungroup()
df
# # A tibble: 93 x 4
# date id var1 source
# <int> <int> <dbl> <chr>
# 1 1 8 9.89 df1
# 2 2 4 10.9 df1
# 3 4 3 8.45 df1
# 4 5 3 16.5 df1
# 5 5 7 10.6 df1
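If the source objects follow a naming pattern like this, a more compact way to build df.long is mget() plus bind_rows(), which turns the list names into the id column (a sketch of the same result):
df.list <- mget(yr19)                         # named list: df1, df2
df.long <- bind_rows(df.list, .id = "source")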
Sample data
dat <-
data.frame(Sim.Y1 = rnorm(10), Sim.Y2 = rnorm(10),
Sim.Y3 = rnorm(10), obsY = rnorm(10),
ID = sample(1:10, 10), ID_s = rep(1:2, each = 5))
For the following vector of column names, I want to calculate the mean of each corresponding column, grouped by ID_s:
simVec <- c('Sim.Y1.cor','Sim.Y2.cor')
for(s in simVec){
simRef <- simVec[s]
simID <- unlist(strsplit(simRef, split = '.cor',fixed = T))[1]
# this works
dat %>% dplyr::group_by(ID_s) %>%
dplyr::summarise(meanMod = mean(Sim.Y1))
# this doesn't work
dat %>% dplyr::group_by(ID_s) %>%
dplyr::summarise(meanMod = mean(!!(simID)))
}
How do I refer to a column in dplyr without using its explicit name?
Note that your particular task can be performed without any non-standard evaluation by using summarize_at(), which works directly with strings:
simIDs <- stringr::str_split(simVec, ".cor") %>% purrr::map_chr(1)
# [1] "Sim.Y1" "Sim.Y2"
dat %>% dplyr::group_by(ID_s) %>% dplyr::summarise_at(simIDs, mean)
# # A tibble: 2 x 3
# ID_s Sim.Y1 Sim.Y2
# <int> <dbl> <dbl>
# 1 1 0.494 -0.0522
# 2 2 -0.104 -0.370
A custom suffix can also be supplied through the named list:
dat %>% dplyr::group_by(ID_s) %>% dplyr::summarise_at(simIDs, list(m=mean))
# # A tibble: 2 x 3
# ID_s Sim.Y1_m Sim.Y2_m <--- Note the _m suffix
# <int> <dbl> <dbl>
# 1 1 0.494 -0.0522
# 2 2 -0.104 -0.370
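In newer dplyr versions (1.0 and later), summarise_at() is superseded by across(); an equivalent call would look like this (a sketch):
dat %>%
  dplyr::group_by(ID_s) %>%
  dplyr::summarise(dplyr::across(dplyr::all_of(simIDs), mean))
and list(m = mean) works the same way inside across() to get the _m suffix.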
First, you have to use seq_along() if you want to index your vector with s.
Second, you are missing sym().
This should work:
simVec <- c('Sim.Y1.cor','Sim.Y3.cor')
for(s in seq_along(simVec)){
simRef <- simVec[s]
simID <- unlist(strsplit(simRef, split = '.cor',fixed = T))[1]
# this works
dat %>% dplyr::group_by(ID_s) %>%
dplyr::summarise(meanMod = mean(Sim.Y1))
# this doesn't work
dat %>% dplyr::group_by(ID_s) %>%
dplyr::summarise(meanMod = mean(!!sym(simID)))
}
Edit: typo fixed.
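Another option that avoids rlang quoting altogether is the .data pronoun, which accepts a character string directly (a sketch):
dat %>% dplyr::group_by(ID_s) %>%
  dplyr::summarise(meanMod = mean(.data[[simID]]))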
Try this
library(dplyr)
dat %>% group_by(ID) %>%
summarise(mean_y1 =mean(Sim.Y1),
mean_y2 =mean(Sim.Y2),
mean_y3 =mean(Sim.Y3),
mean_obsY = mean(obsY))
I understand the question to be: how do you get a column without referencing the column name, i.e. using its index instead?
Let me know if my understanding is incorrect.
If not, I believe the easiest way is shown below.
> df1 <- data.frame(ID_s=c('a','b','c'),Val=c('a1','b1','c1'))
> df1
ID_s Val
1 a a1
2 b b1
3 c c1
> df1[,1]
[1] a b c
Levels: a b c
If you want to save that as a data frame, it can be extended as below:
cc <- data.frame(ID_s=df1[,1])
Hope this helps!
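Within a dplyr pipeline, the same positional lookup can also be written without base bracket indexing (a sketch):
library(dplyr)
df1 %>% pull(1)    # first column as a vector
df1 %>% select(1)  # first column, still a data frame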
I am having trouble applying multiple filters to a data frame based on lists of values. My real data set is huge, so I created a fake one below to make the question reproducible.
set.seed(1)
df <- data.frame(Cluster=round(runif(2000,1,50)),
Grup = paste0("Group",round(runif(2000,1,10))),
ID = paste0("id",1:2000),
Point1 = round(runif(2000,1,100)),
Point2 = round(runif(2000,1,100)))
Cluster_grup <- list(List1 = data.frame( V1=c(47,35),V2=c(20,35)),
List2 = data.frame(V1=c(10,5,6),V2=c(49,2,46),V3=c(11,12,13)),
List3 = data.frame(V1=c(22,3),V2=c(18,18),V3=c(50,25),V4=c(6,7)))
Grup_info <- list(First = c("Group1","Group7"),
Second = c("Group4","Group5","Group3"),
Third = c("Group10","Group8","Group1","Group6"))
I basically want to filter with respect to the data inside Grup_info and Cluster_grup. For instance, if we take the first elements of those two lists:
Grup_info[[1]]
"Group1" "Group7"
Cluster_grup [[1]]
V1 V2
1 47 20
2 35 35
Then I need to filter and apply expand.grid like this:
df_sorted1 <- df %>% filter(.,Cluster == 47 & Grup=="Group1") %>%
select(.,ID,Point1,Point2)
df_sorted2 <-df %>% filter(.,Cluster == 20 & Grup=="Group7") %>%
select(.,ID,Point1,Point2)
ep1 <- expand.grid(df_sorted1$ID,df_sorted2$ID)
ep2 <- expand.grid(df_sorted1$Point1,df_sorted2$Point1)
ep3 <- expand.grid(df_sorted1$Point2,df_sorted2$Point2)
data.frame(ep1, SumPoint1 = rowSums(ep2),SumPoint2 = rowSums(ep3))
The very same thing will then be applied with Cluster == 35 inside the filter function, and I will bind those two data frames together as well.
But as you can see, the lengths of the groups are not equal; for example, the third element of Grup_info has four entries, as does the third element of Cluster_grup.
In the end, I want to get a list of three data frames, each of which is the row-bound collection of expand.grid outputs for one list element.
I can achieve this with for loops or apply-family functions, but I wonder if there is a faster solution, such as a tidyverse approach.
Nice to see you, maydin. I think the code below does what you want.
Data Input
set.seed(1)
library(dplyr)
library(tidyverse)
library(rlang)
library(data.table)
df <- data.frame(Cluster=round(runif(2000,1,50)),
Grup = paste0("Group",round(runif(2000,1,10))),
ID = paste0("id",1:2000),
Point1 = round(runif(2000,1,100)),
Point2 = round(runif(2000,1,100)))
Cluster_grup <- list(List1 = data.frame( V1=c(47,35),V2=c(20,35)),
List2 = data.frame(V1=c(10,5,6),V2=c(49,2,46),V3=c(11,12,13)),
List3 = data.frame(V1=c(22,3),V2=c(18,18),V3=c(50,25),V4=c(6,7)))
Grup_info <- list(List1 = c("Group1","Group7"),
List2 = c("Group4","Group5","Group3"),
List3 = c("Group10","Group8","Group1","Group6"))
Data merge
I merged Cluster_grup and Grup_info.
mergeGrp <-
sapply(names(Grup_info), function(x){
material <- Cluster_grup[[ x ]]
colnames(material)<- Grup_info[[x]]
return(material)
})
> mergeGrp
$List1
Group1 Group7
1 47 20
2 35 35
$List2
Group4 Group5 Group3
1 10 49 11
2 5 2 12
3 6 46 13
$List3
Group10 Group8 Group1 Group6
1 22 18 50 6
2 3 18 25 7
Data handling
I used rbindlist() to merge all the results at the end (the commented-out line below). If you don't want that, you can work with the list of data frames directly.
FinalResult = lapply(mergeGrp, function(x) {
  # long format: one row per (Grup, Cluster) pair, grouped by group name
  tidyTest = x %>% tidyr::gather() %>% dplyr::group_by(key)
  result = NULL
  for (i in 1:NROW(x)) {
    # take the i-th cluster value within each group
    mate = tidyTest %>% filter(row_number() == i)
    # build one filter condition string per (Cluster, Grup) pair
    condList = apply(mate, 1, function(x) {
      sprintf("( Cluster == %s & Grup == '%s' )", x[2], x[1])
    })
    # apply each condition to df and keep the ID and Point columns
    filtered = lapply(condList, function(x) {
      df %>% filter_(x) %>% select(ID, Point1, Point2)
    })
    # all ID combinations, plus row sums of the Point1/Point2 combinations
    ep1 = filtered %>% purrr::map(., ~.$ID) %>%
      as.vector() %>% expand.grid()
    ep2 = filtered %>% purrr::map(., ~.$Point1) %>% as.vector() %>%
      expand.grid() %>% rowSums()
    ep3 = filtered %>% purrr::map(., ~.$Point2) %>% as.vector() %>%
      expand.grid() %>% rowSums()
    result = rbind(result, data.frame(ep1, SumPoint1 = ep2, SumPoint2 = ep3))
  }
  return(result)
})
#rbindlist(FinalResult)
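If you do flatten the result, rbindlist() can also record which list element each row came from via its idcol argument (a small sketch):
data.table::rbindlist(FinalResult, idcol = "List")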
I have a list of 140 data.frames ('my.list'). I would like to compute 350 averages, each of a certain range of rows in a certain column of a certain data.frame (this is a bit cryptic); so, 350 different averages like:
Of data.frame #1, the average of column 'Measure1', row 1:5;
Of data.frame #2, the average of column 'Measure3', row 1:4, etc. etc.
I have another data.frame ('my.dfAverage') which indicates for which data.frame, column and rows each average is needed. I want to write the 350 averages and standard deviations to this data.frame (so its columns are: 'average_id', 'dataframe_number', 'column_name', 'row_numbers', 'average' and 'st_dev'). Some value ranges contain NAs; these can be dropped when computing the average.
What is the best way to automatically compute the 350 averages and standard deviations from the list of data.frames based on the info in this data.frame? I thought of a for loop (or maybe lapply?), but I'm quite new to these functions, so I'm not sure which way to go here.
Small reproducible example of my list of data.frames:
my.df1 <- data.frame(ID = c(1:5),
Measure1 = c(2247,2247,1970,1964,1971),
Measure2 = c(2247,2247,NA,1964,1971))
my.df2 <- data.frame(ID = c(1:4),
Measure3 = c(2247,NA,1970,1964),
Measure5 = c(2247,2247,NA,1964))
my.df3 <- data.frame(ID = c(1:4),
Measure6 = c(2247,600,1970,1964),
Measure8 = c(2247,2247,NA,1964))
my.list <- list(list1 = my.df1, list2 = my.df2, list3 = my.df3)
Desired output table for the averages and standard deviation:
my.dfAverage <- data.frame(average_id = c(1:3),
dataframe_number = c(1,2,3),
column_name = c('Measure1','Measure3','Measure6'),
row_numbers = c('1:3','1:4','1:2'),
average = (NA),
st_dev = (NA))
This is a different approach from the one given above: it uses only base R functions. One point to note: make sure the data are created with stringsAsFactors = FALSE.
The idea is to write a helper that walks my.list, row_numbers and column_name in parallel with mapply(), subsets the requested rows of the requested column, and applies a summary function f with na.rm = TRUE:
fun1 <- function(f) {
  # for each (data frame, row range, column) triple in my.dfAverage,
  # subset the rows given by row_numbers and apply f (mean or sd) with na.rm = TRUE
  with(my.dfAverage,
       mapply(function(x, y, z) f(x[eval(parse(text = y)), z], na.rm = TRUE),
              my.list, row_numbers, column_name))
}
transform(my.dfAverage, average = fun1(mean), st_dev = fun1(sd))
average_id dataframe_number column_name row_numbers average st_dev
1 1 1 Measure1 1:3 2154.667 159.9260
2 2 2 Measure3 1:4 2060.333 161.6859
3 3 3 Measure6 1:2 1423.500 1164.6049
Data Used:
my.dfAverage <- data.frame(average_id = c(1:3),
dataframe_number = c(1,2,3),
column_name = c('Measure1','Measure3','Measure6'),
row_numbers = c('1:3','1:4','1:2'),
average = (NA),
st_dev = (NA),stringsAsFactors = F)
A solution using the tidyverse.
First, expand my.dfAverage based on row_numbers.
library(tidyverse)
my.dfAverage2 <- my.dfAverage %>%
separate(row_numbers, into = c("start", "end")) %>%
mutate(row_numbers = map2(start, end, `:`)) %>%
unnest() %>%
select(-start, -end) %>%
mutate(row_numbers = as.integer(row_numbers),
dataframe_number = as.integer(dataframe_number))
Second, transform all data frames in my.list and combine them into a single data frame.
my.list.df <- my.list %>%
setNames(1:length(.)) %>%
map_dfr(function(x){
x2 <- x %>%
gather(column_name, value, -ID)
return(x2)
},.id = "dataframe_number") %>%
mutate(ID = as.integer(ID), dataframe_number = as.integer(dataframe_number)) %>%
rename(row_numbers = ID)
Third, merge my.dfAverage2 and my.list.df and calculate the mean and standard deviation. my.dfAverage3 is the final output.
my.dfAverage3 <- my.dfAverage2 %>%
left_join(my.list.df, by = c("dataframe_number", "column_name", "row_numbers")) %>%
group_by(average_id, dataframe_number, column_name) %>%
summarise(row_numbers = paste(min(row_numbers), max(row_numbers), sep = ":"),
average = mean(value, na.rm = TRUE),
st_dev = sd(value, na.rm = TRUE)) %>%
ungroup()
my.dfAverage3
# A tibble: 3 x 6
# average_id dataframe_number column_name row_numbers average st_dev
# <int> <int> <chr> <chr> <dbl> <dbl>
# 1 1 1 Measure1 1:3 2155 160
# 2 2 2 Measure3 1:4 2060 162
# 3 3 3 Measure6 1:2 1424 1165
DATA
my.list is the same as OP's my.list.
my.dfAverage <- data.frame(average_id = c(1:3),
dataframe_number = c(1,2,3),
column_name = c('Measure1','Measure3','Measure6'),
row_numbers = c('1:3','1:4','1:2'))