Find number of variables on different levels within a dataframe using dplyr? - r

I have the following code and am unsure how this would be written using dplyr
df <- data.frame(
a = c(1, 1, 1, 2, 2, 2, 2, 2),
b = c(1, 2, 3, 2, 3, 2, 3, 2),
c = c(1, 2, 3, 4, 3, 4, 3, 4),
d = c(1, 2, 3, 4, 5, 4, 5, 4),
e = c(1, 2, 3, 2, 3, 4, 3, 5)
)
# Tally, for each candidate count i, how many columns of `df` hold exactly
# i distinct values. Counts that match no column are left out of `results`.
n <- 100  # largest number of distinct values to look for
results <- data.frame(levels = double(), amount = double())
for (i in seq_len(n)) {
  # Keep only the columns that have exactly i distinct values.
  r <- df %>% select_if(~ n_distinct(.) == i)
  if (ncol(r) > 0) {
    results <- rbind(results, data.frame(levels = i, amount = ncol(r)))
  }
}
results
which outputs
levels amount
1 2 1
2 3 1
3 4 1
4 5 2
The use of the for loop and if statement makes me think there must be a
nicer approach though, or at least, one that makes use of dplyr instead.
edit
Data frame with different types
df <- data.frame(
a = c(1, 1, 1, 2, 2, 2, 2, 2),
b = c(1, 2, 3, 2, 3, 2, 3, 2),
c = c(1, 2, 3, 4, 3, 4, 3, 4),
d = c(1, 2, 3, 4, 5, 4, 5, 4),
e = c(1, 2, 3, 2, 3, 4, 3, 5),
f = c('a','b','a','a','a','a','a','b')
)

One dplyr and tidyr possibility could be:
# Reshape to one row per (column, value) pair, keep each pair once, count the
# distinct values per original column, then tally how many columns share
# each count.
df %>%
  pivot_longer(everything(), names_to = "name", values_to = "value") %>%
  distinct(name, value) %>%
  count(name, name = "n_levels") %>%
  count(n_levels)
n_levels n
<int> <int>
1 2 1
2 3 1
3 4 1
4 5 2

library(dplyr)
library(tidyr)
# Count the distinct values of every column in one summary row, reshape that
# row to long form (one row per column), then tally how many columns share
# each distinct-value count.
df %>%
  summarise(across(everything(), n_distinct)) %>%
  pivot_longer(everything()) %>%
  count(value)

A base R approach could be :
# vapply() guarantees one integer per column (sapply() may change its return
# type on empty or ragged input); table() tallies the counts and stack()
# turns the tally into a two-column data frame.
stack(table(vapply(df, function(x) length(unique(x)), integer(1))))
# ind values
#1 2 1
#2 3 1
#3 4 1
#4 5 2

I think this is a better way to do what you want, using dplyr and purrr.
library(tidyverse)
df <- data.frame(
a = c(1, 1, 1, 2, 2, 2, 2, 2),
b = c(1, 2, 3, 2, 3, 2, 3, 2),
c = c(1, 2, 3, 4, 3, 4, 3, 4),
d = c(1, 2, 3, 4, 5, 4, 5, 4),
e = c(1, 2, 3, 2, 3, 4, 3, 5)
)
map_df(df, function(d){
data.frame(level = n_distinct(d))
}) %>%
group_by(level) %>%
summarise(amount = n())

Related

Add next row's value to a list and then repeat adding row after row

I am running this code to create the DF included in the picture below:
library(tidyverse)
library(espnscrapeR)
steelers_data <- espnscrapeR::get_nfl_pbp(401326308) %>%
group_by(drive_id) %>%
janitor::clean_names() %>%
dplyr::rename(
posteam = pos_team_abb,
qtr = drive_start_qtr,
desc = play_desc) %>%
select(game_id, drive_id, posteam, play_id, play_type, yards_gained, desc,
logo, qtr, drive_result, home_team_abb, away_team_abb, home_wp) %>%
mutate(home_wp = dplyr::lag(home_wp, 1)) %>%
mutate(away_wp = 1.00 - home_wp) %>%
filter(posteam == "PIT") %>%
filter(!play_type %in% c("Kickoff", "End Period", "End of Half", "Timeout", "Kickoff Return (Offense)", "End of Game"))
##testing
testing <- steelers_data %>%
filter(drive_id == "4013263082") %>%
group_by(play_id) %>%
summarize(wp_data = list(away_wp), .groups = "drop")
However, I would like to add the next row's wp_data to the prior. My desired output is this:
Any help is greatly appreciated.
Use purrr::accumulate:
library(tidyverse)
data.frame(A = 1:10) %>%
mutate(A2 = purrr::accumulate(A, c))
or Reduce:
df <- data.frame(A = 1:10)
# With accumulate = TRUE, Reduce() returns every intermediate result, so
# element i is c(A[1], ..., A[i]); assigning that list creates a list-column.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
df$A2 <- Reduce(c, df$A, accumulate = TRUE)
output
A A2
1 1 1
2 2 1, 2
3 3 1, 2, 3
4 4 1, 2, 3, 4
5 5 1, 2, 3, 4, 5
6 6 1, 2, 3, 4, 5, 6
7 7 1, 2, 3, 4, 5, 6, 7
8 8 1, 2, 3, 4, 5, 6, 7, 8
9 9 1, 2, 3, 4, 5, 6, 7, 8, 9
10 10 1, 2, 3, 4, 5, 6, 7, 8, 9, 10

How to sort each column of a df in descending order regardless of the row order?

I am trying to sort my data in descending or ascending order regardless of the data in the rows. I made a dummy example below:
A <- c(9, 9, 5, 4, 6, 3, 2, NA)
B <- c(9, 5, 3, 4, 1, 4, NA, NA)
C <- c(1, 4, 5, 6, 7, 4, 2, 4)
base <- data.frame(A, B, C)
df <- base
# sort() drops NAs by default, which would shorten the vector and make the
# column assignment fail; na.last = TRUE keeps them at the end instead.
df$A <- sort(df$A, na.last = TRUE)
df$B <- sort(df$B, na.last = TRUE)
df$C <- sort(df$C, na.last = TRUE)
We get this
structure(list(A = c(2, 3, 3, 4, 4, 4, 5, 5, 6, 9, 9, NA), B = c(1,
2, 3, 4, 4, 4, 5, 5, 9, 10, NA, NA), C = c(1, 2, 3, 4, 4, 4,
5, 5, 6, 7, 8, 8)), row.names = c(NA, -12L), class = "data.frame")
I want to get something similar to df but my data have hundreds of columns, is there an easier way to do it?
I tried arrange_all() but the result is not what I want.
library(tidyverse)
test <- base%>%
arrange_all()
Obtaining this:
structure(list(A = c(2, 3, 3, 4, 4, 4, 5, 5, 6, 9, 9, NA), B = c(NA,
2, 4, 4, 5, 10, 3, 4, 1, 5, 9, NA), C = c(2, 3, 4, 6, 8, 5, 5,
8, 7, 4, 1, 4)), class = "data.frame", row.names = c(NA, -12L
))
You can sort each column individually :
library(dplyr)
# Sort every column independently of the others. NAs are kept and placed last
# so each column keeps its original length. Columns are selected explicitly and
# na.last is passed inside the lambda, because omitting `.cols` and forwarding
# arguments through across()'s `...` are both deprecated.
base %>% mutate(across(everything(), ~ sort(.x, na.last = TRUE)))
# A B C
#1 2 1 1
#2 3 3 2
#3 4 4 4
#4 5 4 4
#5 6 5 4
#6 9 9 5
#7 9 NA 6
#8 NA NA 7
Or in base R :
base[] <- lapply(base, sort, na.last = TRUE)

R reshape wide to long: multiple variables, observations with multiple indicies

I have got some data containing observations with multiple indices $y_{ibc}$ stored in a messy wide format. I have been fiddling around with tidyr and reshape2 but could not figure it out (reshaping really is my nemesis).
Here is an example:
df <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9), a1b1c1 = c(5,
2, 1, 4, 3, 1, 0, 1, 3), a2b1c1 = c(3, 4, 1, 1, 3, 2, 1, 4, 4
), a3b1c1 = c(4, 0, 0, 1, 1, 1, 0, 0, 1), a1b2c1 = c(1, 0, 4,
2, 4, 1, 0, 4, 2), a2b2c1 = c(2, 0, 1, 0, 1, 0, 3, 2, 0), a3b2c1 = c(2,
4, 3, 0, 2, 3, 3, 3, 4), yc1 = c(1, 2, 2, 1, 2, 2, 2, 1, 1), a1b1c2 = c(4,
2, 3, 0, 4, 4, 2, 1, 4), a2b1c2 = c(3, 0, 3, 3, 4, 4, 3, 2, 2
), a3b1c2 = c(3, 1, 0, 1, 4, 0, 2, 2, 3), a1b2c2 = c(2, 2, 0,
3, 2, 1, 4, 1, 0), a2b2c2 = c(3, 0, 2, 3, 4, 4, 4, 0, 4), a3b2c2 = c(0,
0, 0, 2, 0, 0, 1, 4, 3), yc2 = c(2, 2, 2, 1, 2, 2, 2, 1, 1), X = c(5,
6, 3, 7, 4, 3, 2, 3, 2)), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"))
This is what I want (excerpt):
id b c y a1 a2 a3 X
1 1 b1 c1 1 5 3 4 5
2 1 b2 c1 1 1 2 2 5
3 1 b1 c2 2 4 3 3 5
4 1 b2 c2 2 2 3 0 5
Using tidyr & dplyr:
library(tidyverse)
df %>%
pivot_longer(cols = matches("a.b.c."), names_to = "name", values_to = "value") %>%
separate(name, into = c("a", "b", "c"), sep = c(2,4)) %>%
mutate(y = case_when(c == "c1" ~ yc1,
c == "c2" ~ yc2)) %>%
pivot_wider(names_from = a, values_from = value) %>%
select(id, b, c, y, a1, a2, a3, X)
First, convert all your a/b/c columns to a long format & separate the 3 values into separate columns. Then combine your y columns into one depending on the value of c using mutate and case_when (you could also use if_else for two options but case_when is more expandable for more values). Then pivot your a columns back to wide format and use select to put them in the right order and get rid of the yc1 and yc2 columns.

Creating a new variable based on numeric differences between two other variables in r

Here's an example dataset.
structure(list(vector1 = c(1, 4, 4, 2, 1, 3, 2, 3, 4, 5, 3, 5,
5, 1, 4, 2, 4, 5, 2, 5), vector2 = c(4, 2, 3, 5, 3, 5, 2, 2,
3, 3, 4, 1, 4, 1, 2, 1, 2, 1, 1, 2)), class = "data.frame", row.names = c(NA,
-20L))
Basically what I'm trying to do is create a new variable 'Direction' based on differences between these numbers. I want to say something like:
if vector2 == vector1 or vector2 == vector1 +/- 1 then Direction == 'NS'
if vector2 < vector1 - 1 or if vector2 > vector1 + 1 then Direction == 'EW'
Hopefully this makes sense. Thanks!
A similar solution is this (slightly simpler):
Data:
df <- data.frame(
vector1 = c(1, 4, 4, 2, 1, 3, 2, 3, 4, 5, 3, 5, 5, 1, 4, 2, 4, 5, 2, 5),
vector2 = c(4, 2, 3, 5, 3, 5, 2, 2, 3, 3, 4, 1, 4, 1, 2, 1, 2, 1, 1, 2)
)
Desired new column:
# "NS" when the two values are equal or differ by exactly 1, otherwise "EW".
# Both columns are referenced through `df$`: a bare `vector2` is not in scope
# outside the data frame and would raise "object 'vector2' not found".
df$direction <- ifelse(df$vector1 == df$vector2 |
                         df$vector1 == df$vector2 + 1 |
                         df$vector1 == df$vector2 - 1, "NS", "EW")
Outcome:
df
vector1 vector2 direction
1 1 4 EW
2 4 2 EW
3 4 3 NS
4 2 5 EW
5 1 3 EW
6 3 5 EW
7 2 2 NS
8 3 2 NS
9 4 3 NS
10 5 3 EW
11 3 4 NS
12 5 1 EW
13 5 4 NS
14 1 1 NS
15 4 2 EW
16 2 1 NS
17 4 2 EW
18 5 1 EW
19 2 1 NS
20 5 2 EW
you can try this
# Example data: two numeric columns to compare row by row.
df <- data.frame(
  vector1 = c(1, 4, 4, 2, 1, 3, 2, 3, 4, 5, 3, 5, 5, 1, 4, 2, 4, 5, 2, 5),
  vector2 = c(4, 2, 3, 5, 3, 5, 2, 2, 3, 3, 4, 1, 4, 1, 2, 1, 2, 1, 1, 2)
)
# "NS" when vector2 equals vector1 or vector1 +/- 1, "EW" when it lies more
# than 1 away, and NA when neither test holds.
df$direction <- with(df, {
  adjacent <- vector2 == vector1 |
    vector2 == vector1 + 1 |
    vector2 == vector1 - 1
  far <- vector2 < vector1 - 1 | vector2 > vector1 + 1
  ifelse(adjacent, "NS", ifelse(far, "EW", NA))
})

Replicate a data frame by group

I have the following data frame:
df = structure(list(Group = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3,
3), index = c(1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 1, 2, 3)), row.names = c(NA,
-13L), class = c("tbl_df", "tbl", "data.frame"))
I would like to replicate the column index according to the Group column, one time with each number appearing n consecutive times, and a second time all the numbers appear as a group n times, where n is the size of the group (similarly to rep versus rep with each).
So the output would look like this (let's look only at Group 1 because it is too long):
First option:
df = structure(list(Group = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1), index = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4,
4, 4, 4)), row.names = c(NA, -16L), class = c("tbl_df", "tbl",
"data.frame"))
Second option:
df = structure(list(Group = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1), index = c(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
2, 3, 4)), row.names = c(NA, -16L), class = c("tbl_df", "tbl",
"data.frame"))
How do I do this with group_by?
You could use rep and slice like this
library(dplyr)
Option 1:
df %>%
group_by(Group) %>%
slice(rep(seq_len(n()), each = n()))
Option 2 :
df %>%
group_by(Group) %>%
slice(rep(seq_len(n()), n()))
You can use a combination of do and lapply to replicate the whole group
# rep(..., times = nrow(.)) repeats each column as a whole block, nrow(.) times
# (1, 2, 3, 4, 1, 2, 3, 4, ...) -- this matches the question's "Second option".
# Inside do(), `.` is expected to be the rows of the current group.
df %>% group_by(Group) %>%
do(lapply(.,rep,times=nrow(.)) %>% as.data.frame())
# rep(..., each = nrow(.)) repeats every element nrow(.) times consecutively
# (1, 1, 1, 1, 2, 2, 2, 2, ...) -- this matches the question's "First option".
df %>% group_by(Group) %>%
do(lapply(.,rep,each=nrow(.)) %>% as.data.frame())
We can use uncount
library(tidyverse)
df %>%
group_by(Group) %>%
uncount(n())
# A tibble: 61 x 2
# Groups: Group [3]
# Group index
# <dbl> <dbl>
# 1 1 1
# 2 1 1
# 3 1 1
# 4 1 1
# 5 1 2
# 6 1 2
# 7 1 2
# 8 1 2
# 9 1 3
#10 1 3
# … with 51 more rows
Or using data.table
library(data.table)
setDT(df)[, .SD[rep(seq_len(.N), .N)], Group]
Or with base R
do.call(rbind, lapply(split(df, df$Group),
function(x) x[rep(seq_len(nrow(x)), nrow(x)),]))

Resources