I have the following data frame:
df = structure(list(Group = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3,
3), index = c(1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 1, 2, 3)), row.names = c(NA,
-13L), class = c("tbl_df", "tbl", "data.frame"))
I would like to replicate the index column within each Group in two ways: once with each value repeated n consecutive times, and once with the whole sequence of values repeated n times, where n is the size of the group (analogous to rep with each = versus rep with times =).
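To illustrate with plain rep for a single group of size 4, the two patterns I mean are:
rep(1:4, each = 4)  # 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4  (first option)
rep(1:4, times = 4) # 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4  (second option)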
So the output would look like this (let's look only at Group 1, since the full output would be too long):
First option:
df = structure(list(Group = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1), index = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4,
4, 4, 4)), row.names = c(NA, -16L), class = c("tbl_df", "tbl",
"data.frame"))
Second option:
df = structure(list(Group = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1), index = c(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
2, 3, 4)), row.names = c(NA, -16L), class = c("tbl_df", "tbl",
"data.frame"))
How do I do this with group_by?
You could use rep and slice like this
library(dplyr)
Option 1:
df %>%
group_by(Group) %>%
slice(rep(seq_len(n()), each = n()))
Option 2:
df %>%
group_by(Group) %>%
slice(rep(seq_len(n()), n()))
You can use a combination of do and lapply to replicate the whole group
# whole group repeated n times (second option)
df %>% group_by(Group) %>%
  do(lapply(., rep, times = nrow(.)) %>% as.data.frame())
# each value repeated n times consecutively (first option)
df %>% group_by(Group) %>%
  do(lapply(., rep, each = nrow(.)) %>% as.data.frame())
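do() is superseded in recent dplyr; a rough equivalent with group_modify (just a sketch, assuming dplyr >= 1.0) would be:
df %>%
  group_by(Group) %>%
  group_modify(~ as.data.frame(lapply(.x, rep, each = nrow(.x))))  # first option
df %>%
  group_by(Group) %>%
  group_modify(~ as.data.frame(lapply(.x, rep, times = nrow(.x)))) # second option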
We can use uncount
library(tidyverse)
df %>%
group_by(Group) %>%
uncount(n())
# A tibble: 61 x 2
# Groups: Group [3]
# Group index
# <dbl> <dbl>
# 1 1 1
# 2 1 1
# 3 1 1
# 4 1 1
# 5 1 2
# 6 1 2
# 7 1 2
# 8 1 2
# 9 1 3
#10 1 3
# … with 51 more rows
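A closely related variant (just a sketch, with the tidyverse loaded as above) first stores the group size in a column with add_count, so uncount gets a plain column instead of n():
df %>%
  add_count(Group) %>% # adds a column n holding the group size
  uncount(n)           # repeats each row n times (the first option)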
Or using data.table
library(data.table)
setDT(df)[, .SD[rep(seq_len(.N), .N)], Group]
Or with base R
do.call(rbind, lapply(split(df, df$Group),
function(x) x[rep(seq_len(nrow(x)), nrow(x)),]))
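The uncount version gives the first option (each value repeated consecutively), while the data.table and base R versions give the second; for the first option with those, add each = in the same spot, e.g. a sketch:
setDT(df)[, .SD[rep(seq_len(.N), each = .N)], Group]
do.call(rbind, lapply(split(df, df$Group),
  function(x) x[rep(seq_len(nrow(x)), each = nrow(x)), ]))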
Each day a company creates a value for category_1 and category_2.
A new company may enter the survey midway, as company E does on Dec 25.
Here are three days of data, so there are two intervals: Dec 24-25 and Dec 25-26.
Question
For each category, how many increases/decreases/no-changes were there over the 3 days?
For example, in cat1, A goes from a 2 to a 1, B goes from a 3 to a 4, etc.
By hand I get:
cat1 - Up: 2, Down: 5, No change: 2
cat2 - Up: 6, Down: 2, No change: 1
How do I calculate the number of up/downs/no changes in an R Script?
library("tidyverse")
d1 <- as.Date("2022-12-24")
d2 <- as.Date("2022-12-25")
d3 <- as.Date("2022-12-26")
df <- tibble(
company = c(LETTERS[1:4], LETTERS[1:5], LETTERS[1:5]),
cat1 = c(2, 3, 4, 5, 1, 4, 5, 3, 2, 1, 4, 4, 2, 1),
cat2 = c(6, 7, 8, 9, 5, 5, 9, 10, 11, 6, 5, 10, 12, 13),
date = c(rep(d1, 4), rep(d2, 5), rep(d2, 5))
)
df
One approach using dplyr, assuming the data are arranged by date within each company. Note: I corrected the typo in the question's data so that the third day uses d3 (see the Data block below).
library(dplyr)
df %>%
  group_by(company) %>%
  mutate(cat1_change = cat1 - lag(cat1),
         cat2_change = cat2 - lag(cat2)) %>%
  ungroup() %>%
  summarize(type = c("up", "down", "no-change"),
            across(ends_with("change"),
                   ~ c(sum(.x > 0, na.rm = TRUE),
                       sum(.x < 0, na.rm = TRUE),
                       sum(.x == 0, na.rm = TRUE))))
# A tibble: 3 × 3
type cat1_change cat2_change
<chr> <int> <int>
1 up 2 6
2 down 5 2
3 no-change 2 1
Data
df <- structure(list(company = c("A", "B", "C", "D", "A", "B", "C",
"D", "E", "A", "B", "C", "D", "E"), cat1 = c(2, 3, 4, 5, 1, 4,
5, 3, 2, 1, 4, 4, 2, 1), cat2 = c(6, 7, 8, 9, 5, 5, 9, 10, 11,
6, 5, 10, 12, 13), date = structure(c(19350, 19350, 19350, 19350,
19351, 19351, 19351, 19351, 19351, 19352, 19352, 19352, 19352,
19352), class = "Date")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -14L))
An option with data.table: grouped by 'company', loop over the 'cat' columns, take the diff of adjacent elements, convert it to its sign and attach factor labels ("down", "no-change", "up"), then melt to long format and dcast back to wide format.
library(data.table)
dcast(melt(setDT(df)[, lapply(.SD, \(x) factor(sign(diff(x)),
levels = c(-1, 0, 1), labels = c("down", "no-change", "up"))),
company, .SDcols = patterns("^cat")], id.var = "company",
value.name = "type"), type ~ paste0(variable, "_change"), length)
Output:
type cat1_change cat2_change
1: down 5 2
2: no-change 2 1
3: up 2 6
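For completeness, a rough base R sketch of the same tally (assuming df as in the Data block above, i.e. rows in date order within each company):
# sign of the day-over-day change per company, then tabulate per category
chg <- function(x) sign(unlist(tapply(x, df$company, diff)))
sapply(df[, c("cat1", "cat2")],
       function(x) table(factor(chg(x), levels = c(1, -1, 0),
                                labels = c("up", "down", "no-change"))))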
I am running this code to create the DF included in the picture below:
library(tidyverse)
library(espnscrapeR)
steelers_data <- espnscrapeR::get_nfl_pbp(401326308) %>%
group_by(drive_id) %>%
janitor::clean_names() %>%
dplyr::rename(
posteam = pos_team_abb,
qtr = drive_start_qtr,
desc = play_desc) %>%
select(game_id, drive_id, posteam, play_id, play_type, yards_gained, desc,
logo, qtr, drive_result, home_team_abb, away_team_abb, home_wp) %>%
mutate(home_wp = dplyr::lag(home_wp, 1)) %>%
mutate(away_wp = 1.00 - home_wp) %>%
filter(posteam == "PIT") %>%
filter(!play_type %in% c("Kickoff", "End Period", "End of Half", "Timeout", "Kickoff Return (Offense)", "End of Game"))
##testing
testing <- steelers_data %>%
filter(drive_id == "4013263082") %>%
group_by(play_id) %>%
summarize(wp_data = list(away_wp), .groups = "drop")
However, I would like each row's wp_data to also include the values from all prior rows (a running accumulation). My desired output is this:
Any help is greatly appreciated.
Use purrr::accumulate:
library(tidyverse)
data.frame(A = 1:10) %>%
mutate(A2 = purrr::accumulate(A, c))
or Reduce:
df <- data.frame(A = 1:10)
df$A2 <- Reduce(c, df$A, accumulate = TRUE)
Output:
A A2
1 1 1
2 2 1, 2
3 3 1, 2, 3
4 4 1, 2, 3, 4
5 5 1, 2, 3, 4, 5
6 6 1, 2, 3, 4, 5, 6
7 7 1, 2, 3, 4, 5, 6, 7
8 8 1, 2, 3, 4, 5, 6, 7, 8
9 9 1, 2, 3, 4, 5, 6, 7, 8, 9
10 10 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
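Applied to your testing data frame (a sketch, assuming wp_data is the list-column built above), that would be roughly:
testing %>%
  mutate(wp_data = purrr::accumulate(wp_data, c)) # each row now also carries all prior away_wp values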
I am trying to sort each column of my data in descending or ascending order independently, regardless of how the rows line up. I made a dummy example below:
A <- c(9,9,5,4,6,3,2,NA)
B <- c(9,5,3,4,1,4,NA,NA)
C <- c(1,4,5,6,7,4,2,4)
base <- data.frame(A,B,C)
df <- base
df$A <- sort(df$A,na.last = T)
df$B <- sort(df$B,na.last = T)
df$C <- sort(df$C)
We get this:
structure(list(A = c(2, 3, 4, 5, 6, 9, 9, NA), B = c(1, 3, 4,
4, 5, 9, NA, NA), C = c(1, 2, 4, 4, 4, 5, 6, 7)), row.names = c(NA,
-8L), class = "data.frame")
I want to get something similar to df, but my data has hundreds of columns. Is there an easier way to do this?
I tried arrange_all(), but the result is not what I want.
library(tidyverse)
test <- base%>%
arrange_all()
Obtaining this:
structure(list(A = c(2, 3, 4, 5, 6, 9, 9, NA), B = c(NA, 4, 4,
3, 1, 5, 9, NA), C = c(2, 4, 6, 5, 7, 4, 1, 4)), class = "data.frame",
row.names = c(NA, -8L))
You can sort each column individually:
library(dplyr)
base %>% mutate(across(everything(), ~ sort(.x, na.last = TRUE)))
# A B C
#1 2 1 1
#2 3 3 2
#3 4 4 4
#4 5 4 4
#5 6 5 4
#6 9 9 5
#7 9 NA 6
#8 NA NA 7
Or in base R:
base[] <- lapply(base, sort, na.last = TRUE)
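Since the question mentions descending order too, both forms take the same decreasing argument of sort; a quick sketch:
base %>% mutate(across(everything(), ~ sort(.x, decreasing = TRUE, na.last = TRUE)))
# or
base[] <- lapply(base, sort, decreasing = TRUE, na.last = TRUE)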
I have some data containing observations with multiple indices $y_{ibc}$ stored in a messy wide format. I have been fiddling around with tidyr and reshape2 but could not figure it out (reshaping really is my nemesis).
Here is an example:
df <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9), a1b1c1 = c(5,
2, 1, 4, 3, 1, 0, 1, 3), a2b1c1 = c(3, 4, 1, 1, 3, 2, 1, 4, 4
), a3b1c1 = c(4, 0, 0, 1, 1, 1, 0, 0, 1), a1b2c1 = c(1, 0, 4,
2, 4, 1, 0, 4, 2), a2b2c1 = c(2, 0, 1, 0, 1, 0, 3, 2, 0), a3b2c1 = c(2,
4, 3, 0, 2, 3, 3, 3, 4), yc1 = c(1, 2, 2, 1, 2, 2, 2, 1, 1), a1b1c2 = c(4,
2, 3, 0, 4, 4, 2, 1, 4), a2b1c2 = c(3, 0, 3, 3, 4, 4, 3, 2, 2
), a3b1c2 = c(3, 1, 0, 1, 4, 0, 2, 2, 3), a1b2c2 = c(2, 2, 0,
3, 2, 1, 4, 1, 0), a2b2c2 = c(3, 0, 2, 3, 4, 4, 4, 0, 4), a3b2c2 = c(0,
0, 0, 2, 0, 0, 1, 4, 3), yc2 = c(2, 2, 2, 1, 2, 2, 2, 1, 1), X = c(5,
6, 3, 7, 4, 3, 2, 3, 2)), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"))
This is what I want (excerpt):
id b c y a1 a2 a3 X
1 1 b1 c1 1 5 3 4 5
2 1 b2 c1 1 1 2 2 5
3 1 b1 c2 2 4 3 3 5
4 1 b2 c2 2 2 3 0 5
Using tidyr & dplyr:
library(tidyverse)
df %>%
pivot_longer(cols = matches("a.b.c."), names_to = "name", values_to = "value") %>%
separate(name, into = c("a", "b", "c"), sep = c(2,4)) %>%
mutate(y = case_when(c == "c1" ~ yc1,
c == "c2" ~ yc2)) %>%
pivot_wider(names_from = a, values_from = value) %>%
select(id, b, c, y, a1, a2, a3, X)
First, convert all your a/b/c columns to long format and separate the three-part names into separate columns. Then combine your y columns into one, depending on the value of c, using mutate and case_when (you could also use if_else for two options, but case_when extends more easily to more values). Then pivot your a columns back to wide format and use select to put the columns in the right order and drop the yc1 and yc2 columns.
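A more compact alternative (just a sketch, assuming tidyr >= 1.0) uses the .value sentinel in pivot_longer, so the a columns never need to be pivoted back:
df %>%
  pivot_longer(matches("a.b.c."),
               names_to = c(".value", "b", "c"),
               names_pattern = "(a\\d)(b\\d)(c\\d)") %>%
  mutate(y = if_else(c == "c1", yc1, yc2)) %>%
  select(id, b, c, y, a1, a2, a3, X)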
I have the following code and am unsure how it would be written using dplyr:
df <- data.frame(
a = c(1, 1, 1, 2, 2, 2, 2, 2),
b = c(1, 2, 3, 2, 3, 2, 3, 2),
c = c(1, 2, 3, 4, 3, 4, 3, 4),
d = c(1, 2, 3, 4, 5, 4, 5, 4),
e = c(1, 2, 3, 2, 3, 4, 3, 5)
)
library(dplyr)

n <- 100
results <- data.frame(levels = double(), amount = double())
for (i in 1:n) {
  r <- df %>% select_if(~ n_distinct(.) == i)
  if (dim(r)[2] > 0) {
    results <- rbind(results, data.frame(levels = i, amount = dim(r)[2]))
  }
}
results
which outputs
levels amount
1 2 1
2 3 1
3 4 1
4 5 2
The use of the for loop and if statement makes me think there must be a
nicer approach though, or at least, one that makes use of dplyr instead.
Edit: data frame with columns of different types
df <- data.frame(
a = c(1, 1, 1, 2, 2, 2, 2, 2),
b = c(1, 2, 3, 2, 3, 2, 3, 2),
c = c(1, 2, 3, 4, 3, 4, 3, 4),
d = c(1, 2, 3, 4, 5, 4, 5, 4),
e = c(1, 2, 3, 2, 3, 4, 3, 5),
f = c('a','b','a','a','a','a','a','b')
)
One dplyr and tidyr possibility could be:
df %>%
pivot_longer(everything()) %>%
group_by(name) %>%
summarise(n_levels = n_distinct(value)) %>%
ungroup() %>%
count(n_levels)
n_levels n
<int> <int>
1 2 1
2 3 1
3 4 1
4 5 2
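For the edited data frame with a character column, the columns can no longer be stacked into one value column as-is; one way (a sketch, assuming tidyr >= 1.1) is to coerce everything to character via values_transform, which does not affect n_distinct:
library(dplyr)
library(tidyr)
df %>%
  pivot_longer(everything(), values_transform = list(value = as.character)) %>%
  group_by(name) %>%
  summarise(n_levels = n_distinct(value)) %>%
  ungroup() %>%
  count(n_levels)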
library(dplyr)
library(tidyr)
df %>%
summarise_all(.funs = function(x) length(unique(x))) %>%
pivot_longer(everything()) %>% # or gather() %>%
count(value)
A base R approach could be:
stack(table(sapply(df, function(x) length(unique(x)))))
# ind values
#1 2 1
#2 3 1
#3 4 1
#4 5 2
I think this is a better way to do what you want, using dplyr and purrr.
library(tidyverse)
df <- data.frame(
a = c(1, 1, 1, 2, 2, 2, 2, 2),
b = c(1, 2, 3, 2, 3, 2, 3, 2),
c = c(1, 2, 3, 4, 3, 4, 3, 4),
d = c(1, 2, 3, 4, 5, 4, 5, 4),
e = c(1, 2, 3, 2, 3, 4, 3, 5)
)
map_df(df, function(d){
data.frame(level = n_distinct(d))
}) %>%
group_by(level) %>%
summarise(amount = n())
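The summarise_all idea above can also be written with the current across() idiom, which reproduces the levels/amount names from the question (a sketch):
library(dplyr)
library(tidyr)
df %>%
  summarise(across(everything(), n_distinct)) %>%
  pivot_longer(everything(), values_to = "levels") %>%
  count(levels, name = "amount")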