Sample groups and preserve row order - r

I have a dataframe such as:
df <- data.frame(id = factor(c(12321,12321,12321,4445,4445,4445,4445,787,787,787)),
word = c("please", "stop", "that", "the", "fox", "jumps", "that", "please", "eat", "noodles"),
word_id = c(12,5,28,99,214,800,28,12,78,912))
I am attempting to take a sample of the data frame while keeping each id group intact and preserving the word and word_id order.
I tried newDF <- df %>% group_by(id) %>% sample_frac(0.33), but this takes a sample within each group.
Instead, I would like a data frame that samples whole id groups from the original data frame and keeps the rows in order. So if I take a 33% sample of df, I end up with 33% of the id groups, with word and word_id still in their original sequence:
newDF <- data.frame(id = factor(c(12321,12321,12321,4445,4445,4445,4445)),
word = c("please", "stop", "that", "the", "fox", "jumps", "that"),
word_id = c(12,5,28,99,214,800,28))

Adding to alistaire's comment:
library(dplyr)
library(tidyr)
newDF1 <- df %>%
  group_by(id) %>%
  nest() %>%              # one row per id, other columns stored in a list-column "data"
  ungroup() %>%           # ungroup so sample_frac() samples id groups, not within them
  sample_frac(1/3) %>%    # keep a third of the id groups
  unnest(cols = data)     # expand back; word and word_id keep their original order
newDF2 <- anti_join(df, newDF1, by = "id")  # the remaining id groups
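A join-based sketch of the same idea (assuming dplyr >= 1.0 for slice_sample()): sample whole id groups first, then keep df's original row order with semi_join().
sampled_ids <- df %>%
  distinct(id) %>%
  slice_sample(prop = 1/3)            # pick a third of the id groups
newDF3 <- df %>%
  semi_join(sampled_ids, by = "id")   # keeps df's row order, so word and word_id stay in sequence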


How do I add row values to colnames in R

I have a dataframe and I would like to add the first row to the names of the columns
What I have:
col1   col2    col3
city   state   country
...    ...     ...
What I want:
col1_city   col2_state   col3_country
city        state        country
...         ...          ...
I can't do it manually because there are many cols in the df
I'm thinking of something like:
df %>% rename_with(~ names(.) %>%
map_chr(~glue('{.x}_.[1,])))
Thanks!!
With rename_with:
library(dplyr)
df %>%
  rename_with(.cols = everything(),
              .fn = ~ paste0(colnames(df), '_', df[1, ]))
Update: Here's a solution where you can pass the current data as it is created/altered within a pipe:
df |>
  (\(x) (x <- x |>
           rename_with(.cols = everything(),
                       .fn = ~ paste0(colnames(x), '_', x[1, ]))))()
So here you could, for example, do some filtering before the renaming or some mutating or whatever you want.
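For example, a hypothetical variant (assuming dplyr is loaded and using the example data below) that uppercases col3 before the renaming, so that the new names reflect the modified data:
df |>
  (\(x) {
    # transform first; the rename below then uses the updated x
    x <- x |> mutate(col3 = toupper(col3))
    x |> rename_with(.cols = everything(),
                     .fn = ~ paste0(colnames(x), '_', x[1, ]))
  })()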
In base R, just do
names(df) <- paste0(names(df), "_", unlist(df[1,]))
-output
> df
col1_city col2_state col3_country
1 city state country
Or with dplyr
library(dplyr)
library(purrr)   # for set_names()
library(stringr)
df %>%
  set_names(str_c(names(.), '_', slice(., 1)))
-output
col1_city col2_state col3_country
1 city state country
data
df <- structure(list(col1 = "city", col2 = "state",
col3 = "country"), class = "data.frame", row.names = c(NA,
-1L))

Pivot wider in R with multiple columns

I am having trouble converting a particular dataset from long to wide.
col1 col2
ID 55.
animal. dog
animal bear
animal rabbit
shape. circle
ID 67.
animal. cat
shape. square
As you can see, some IDs have multiple observations for "animal" and so I want to make multiple columns like this:
ID. animal. animal2 animal3 shape
55. dog bear. rabbit circle
67. cat. NA NA square
Any help is appreciated!
Try this solution.
Most of the work is in creating a separate ID column and then making unique names for the columns.
library(tidyr)
library(dplyr)
library(vctrs)
df<- structure(list(col1 = c("ID", "animal", "animal", "animal", "shape", "ID", "animal", "shape"),
col2 = c("55.", "dog", "bear", "rabbit", "circle", "67.", "cat", "square")),
class = "data.frame", row.names = c(NA, -8L))
#create the ID column
df$ID <- NA
#find the ID rows
idrows <- which(df$col1 == "ID")
#fill column and delete rows
df$ID[idrows] <- df$col2[idrows]
df <- fill(df, ID, .direction = "down")
df <- df[-idrows, ]
#create unique names within each grouping, then pivot wider
df %>%
  group_by(ID) %>%
  mutate(col1 = vec_as_names(col1, repair = "unique")) %>%
  mutate(col1 = stringr::str_replace(col1, "\\.+1", "")) %>%
  ungroup() %>%
  pivot_wider(id_cols = "ID", names_from = "col1", values_from = "col2")
ID animal animal...2 animal...3 shape
<chr> <chr> <chr> <chr> <chr>
1 55. dog bear rabbit circle
2 67. cat NA NA square
Other alternatives, based on one of your previous questions:
df %>%
  group_by(ID) %>%
  mutate(col1 = paste0(col1, data.table::rowid(col1))) %>%
  ungroup() %>%
  pivot_wider(id_cols = "ID", names_from = "col1", values_from = "col2")
or
df %>%
  pivot_wider(id_cols = "ID", names_from = "col1", values_from = "col2") %>%
  unnest_wider("shape", names_sep = "_") %>%
  unnest_wider("animal", names_sep = "_")
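For reference, a self-contained sketch that reaches a similar wide result without editing df in place (the df_raw name is only for illustration; it assumes the dplyr and tidyr loaded above):
df_raw <- structure(list(col1 = c("ID", "animal", "animal", "animal", "shape", "ID", "animal", "shape"),
                         col2 = c("55.", "dog", "bear", "rabbit", "circle", "67.", "cat", "square")),
                    class = "data.frame", row.names = c(NA, -8L))
df_raw %>%
  # carry the most recent "ID" value down to the rows below it
  mutate(ID = if_else(col1 == "ID", col2, NA_character_)) %>%
  fill(ID) %>%
  filter(col1 != "ID") %>%
  # number repeated names within each ID (animal, animal2, animal3, ...)
  group_by(ID, col1) %>%
  mutate(col1 = if_else(row_number() > 1, paste0(col1, row_number()), col1)) %>%
  ungroup() %>%
  pivot_wider(id_cols = "ID", names_from = "col1", values_from = "col2")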

Split row elements by character and transform into a vector

I have a dataframe looking like:
library(tidyverse)
df <- tibble::tribble(
~respondent, ~selection,
1, "Brain/Energy/Sleep",
2, "Energy/Mood/Sex",
3, "Detox/Sex/Stress"
)
I want to pull out the unique elements of each row, after splitting them at each '/', thereby transforming the column selection into:
selection <- c("selection", "Brain", "Energy", "Sleep", "Energy", "Mood", "Sex", "Detox", "Sex", "Stress")
How to perform this using dplyr?
new_vec <- df %>%
  pull(selection) %>%
  strsplit("/") %>%
  unlist() %>%
  c("selection", .)
new_vec
[1] "selection" "Brain" "Energy" "Sleep" "Energy" "Mood" "Sex"
[8] "Detox" "Sex" "Stress"
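For comparison, the same vector can be built in one line of base R (using the same df):
new_vec <- c("selection", unlist(strsplit(df$selection, "/")))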
Here is another solution that eliminates duplicate elements in each row. I added a duplicate element in the df to exemplify this.
#Your data with a duplicate element
df <- tibble::tribble(
~respondent, ~selection,
1, "Brain/Energy/Sleep/Sleep",
2, "Energy/Mood/Sex/Energy",
3, "Detox/Sex/Stress/Detox"
)
#Number of columns expected after splitting each row on "/"
ncols_exp<-4
#Getting the distinct values per row (respondent)
df %>%
  #Separate each entry in selection as multiple columns
  separate(col = selection,
           into = paste0("Var", 1:ncols_exp),
           sep = "/") %>%
  #Transform data into long format
  pivot_longer(cols = starts_with("Var"),
               names_to = "Var",
               values_to = "Val") %>%
  #Group by "respondent" (each row in the original df)
  group_by(respondent) %>%
  #Get unique elements of the Val column
  distinct(Val) %>%
  #Pull Val column
  pull(Val) %>%
  #Concatenate the unique values with "selection" as the first entry
  c("selection", .)
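A more compact sketch of the same deduplicating idea (assuming tidyr's separate_rows(), loaded with the tidyverse above):
df %>%
  separate_rows(selection, sep = "/") %>%   # one row per element
  distinct(respondent, selection) %>%       # drop duplicates within each respondent
  pull(selection) %>%
  c("selection", .)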

Calculation on every pair from grouped data.frame

My question is about performing a calculation between each pair of groups in a data.frame; I would like it to be more vectorized.
I have a data.frame that consists of the following columns: Location, Sample, Var1, and Var2. I'd like to find the closest match for each Sample between each pair of Locations, for both Var1 and Var2.
I can accomplish this for one pair of locations as such:
library(dplyr)
library(tidyr)
df0 <- data.frame(Location = rep(c("A", "B", "C"), each = 30),
                  Sample = rep(c(1:30), times = 3),
                  Var1 = sample(1:25, 90, replace = T),
                  Var2 = sample(1:25, 90, replace = T))
df00 <- data.frame(Location = rep(c("A", "B", "C"), each = 30),
                   Sample = rep(c(31:60), times = 3),
                   Var1 = sample(1:100, 90, replace = T),
                   Var2 = sample(1:100, 90, replace = T))
df000 <- rbind(df0, df00)
df <- sample_n(df000, 100) # data
dfl <- df %>% gather(VAR, value, 3:4)
df1 <- dfl %>% filter(Location == "A")
df2 <- dfl %>% filter(Location == "B")
df3 <- merge(df1, df2, by = c("VAR"), all.x = TRUE, allow.cartesian = TRUE)
df3 <- df3 %>% mutate(DIFF = abs(value.x - value.y))
result <- df3 %>% group_by(VAR, Sample.x) %>% top_n(-1, DIFF)
I tried other possibilities such as using dplyr::spread but could not avoid the "Error: Duplicate identifiers for rows" or columns half filled with NA.
Is there a more clean and automated way to do this for each possible group pair? I'd like to avoid the manual subset and merge routine for each pair.
One option would be to create the pairwise combination of 'Location' with combn and then do the other steps as in the OP's code
library(tidyverse)
df %>%
  # get the unique elements of Location
  distinct(Location) %>%
  # pull the column as a vector
  pull %>%
  # it is a factor, so convert it to character
  as.character %>%
  # get the pairwise combinations in a list
  combn(m = 2, simplify = FALSE) %>%
  # loop through the list with map and do the full_join
  # with the long-format data dfl from the question
  map(~ full_join(dfl %>%
                    filter(Location == first(.x)),
                  dfl %>%
                    filter(Location == last(.x)), by = "VAR") %>%
        # create a column of absolute differences
        mutate(DIFF = abs(value.x - value.y)) %>%
        # grouped by VAR, Sample.x
        group_by(VAR, Sample.x) %>%
        # apply top_n with wt as DIFF
        top_n(-1, DIFF))
Also, as the OP asked about picking the Locations up automatically instead of filtering twice (the expected output is not entirely clear, though):
df %>%
  distinct(Location) %>%
  pull %>%
  as.character %>%
  combn(m = 2, simplify = FALSE) %>%
  map(~ dfl %>%
        # change here i.e. filter both the Locations
        filter(Location %in% .x) %>%
        # spread it to wide format
        spread(Location, value, fill = 0) %>%
        # create the DIFF column by taking the difference
        mutate(DIFF = abs(!! rlang::sym(first(.x)) -
                            !! rlang::sym(last(.x)))) %>%
        group_by(VAR, Sample) %>%
        top_n(-1, DIFF))
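If a single stacked data frame is easier to work with than a list, a possible follow-up sketch (the pair column and result_all name are illustrative, not part of the original answer) names each pairwise result and binds everything with bind_rows():
pairs <- df %>%
  distinct(Location) %>%
  pull() %>%
  as.character() %>%
  combn(m = 2, simplify = FALSE)
result_all <- pairs %>%
  # name each list element "A-B", "A-C", "B-C", ...
  set_names(map_chr(pairs, paste, collapse = "-")) %>%
  # same per-pair computation as the first option above
  map(~ dfl %>%
        filter(Location == first(.x)) %>%
        full_join(dfl %>% filter(Location == last(.x)), by = "VAR") %>%
        mutate(DIFF = abs(value.x - value.y)) %>%
        group_by(VAR, Sample.x) %>%
        top_n(-1, DIFF)) %>%
  # stack into one data frame, carrying the pair name along
  bind_rows(.id = "pair")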

Removing groups with a certain number of NAs

Sorry to bother with a relatively simple question perhaps.
I have this type of dataframe:
A long list of names in the column "NAME" c(a, b, c, d, e, ...), two potential classes in the column "SURNAME" c(A, B), and a third column "VALUE" containing values.
I want to remove all NAMES for which at least in one of the SURNAME classes I have more than 2 "NA" in the VALUE column.
I wanted to post an example dataset but I am struggling to format it properly
I was trying to use
df <- df %>%
group_by(NAME) %>%
group_by(SURNAME) %>%
filter(!is.na(VALUE)) %>%
filter(length(VALUE)>=3)
it does not throw an error but I have the impression that something is wrong. Any suggestion? Many thanks
Let's create a dataset to work with:
set.seed(1234)
df <- data.frame(
name = sample(x=letters, size=1e3, replace=TRUE),
surname = sample(x=c("A", "B"), size=1e3, replace=TRUE),
value = sample(x=c(1:10*10,NA), size=1e3, replace=TRUE),
stringsAsFactors = FALSE
)
Here's how to do it with Base R:
# count NAs by name-surname combos (na.action arg is important!)
agg <- aggregate(value ~ name + surname, data=df, FUN=function(x) sum(is.na(x)), na.action=NULL)
# rename the count-of-NAs column
names(agg)[3] <- "number_of_na"
#add count of NAs back to original data
df <- merge(df, agg, by=c("name", "surname"))
# subset the original data
result <- df[df$number_of_na < 3, ]
Here's how to do it with data.table:
library(data.table)
dt <- as.data.table(df)
dt[ , number_of_na := sum(is.na(value)), by=.(name, surname)]
result <- dt[number_of_na < 3]
Here's how to do it with dplyr/tidyverse:
library(dplyr) # or library(tidyverse)
result <- df %>%
group_by(name, surname) %>%
summarize(number_of_na = sum(is.na(value))) %>%
right_join(df, by=c("name", "surname")) %>%
filter(number_of_na < 3)
After grouping by 'NAME' and 'SURNAME', create a column with the number of NA elements in that group, and then filter out any 'NAME' that has an 'ind' greater than or equal to 3:
df %>%
group_by(NAME, SURNAME) %>%
mutate(ind = sum(is.na(VALUE))) %>%
group_by(NAME) %>%
filter(!any(ind >=3)) %>%
select(-ind)
Or do an anti_join after doing the filtering by 'NAME', 'SURNAME' based on the condition
df %>%
group_by(NAME, SURNAME) %>%
filter(sum(is.na(VALUE))>=3) %>%
ungroup %>%
distinct(NAME) %>%
anti_join(df, .)
data
set.seed(24)
df <- data.frame(NAME = rep(letters[1:5], each = 20),
SURNAME = sample(LETTERS[1:4], 5 * 20, replace = TRUE),
VALUE = sample(c(NA, 1:3), 5 *20, replace = TRUE),
stringsAsFactors = FALSE)
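As a quick sanity check on this example data, an illustrative helper lists which NAME / SURNAME combinations reach three or more NAs:
df %>%
  group_by(NAME, SURNAME) %>%
  summarise(number_of_na = sum(is.na(VALUE))) %>%
  filter(number_of_na >= 3)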
