Each day, each company reports a value for category_1 and category_2.
A new company may enter the survey midway; company E, for example, first appears on Dec 25.
Here are three days of data, which gives two day-over-day intervals: Dec 24-25 and Dec 25-26.
Question
For each category, how many increases, decreases, and no-changes were there over the 3 days?
For example, in cat1, A goes from 2 to 1, B goes from 3 to 4, etc.
By hand I get:
cat1 - Up: 2, Down: 5, No change: 2
cat2 - Up: 6, Down: 2, No change: 1
How do I calculate the number of ups/downs/no-changes in an R script?
library("tidyverse")
d1 <- as.Date("2022-12-24")
d2 <- as.Date("2022-12-25")
d3 <- as.Date("2022-12-26")
df <- tibble(
  company = c(LETTERS[1:4], LETTERS[1:5], LETTERS[1:5]),
  cat1 = c(2, 3, 4, 5, 1, 4, 5, 3, 2, 1, 4, 4, 2, 1),
  cat2 = c(6, 7, 8, 9, 5, 5, 9, 10, 11, 6, 5, 10, 12, 13),
  date = c(rep(d1, 4), rep(d2, 5), rep(d2, 5))
)
df
One approach using dplyr, assuming the data are arranged by date within company. Note: I changed the typo in the third block of dates from d2 to d3. The na.rm = TRUE is needed because lag() yields an NA for each company's first observation.
library(dplyr)
df %>%
  group_by(company) %>%
  mutate(cat1_change = cat1 - lag(cat1),
         cat2_change = cat2 - lag(cat2)) %>%
  ungroup() %>%
  summarize(type = c("up", "down", "no-change"),
            across(ends_with("change"),
                   ~ c(sum(.x > 0, na.rm = TRUE),
                       sum(.x < 0, na.rm = TRUE),
                       sum(.x == 0, na.rm = TRUE))))
# A tibble: 3 × 3
  type      cat1_change cat2_change
  <chr>           <int>       <int>
1 up                  2           6
2 down                5           2
3 no-change           2           1
Data
df <- structure(list(company = c("A", "B", "C", "D", "A", "B", "C",
"D", "E", "A", "B", "C", "D", "E"), cat1 = c(2, 3, 4, 5, 1, 4,
5, 3, 2, 1, 4, 4, 2, 1), cat2 = c(6, 7, 8, 9, 5, 5, 9, 10, 11,
6, 5, 10, 12, 13), date = structure(c(19350, 19350, 19350, 19350,
19351, 19351, 19351, 19351, 19351, 19352, 19352, 19352, 19352,
19352), class = "Date")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -14L))
An option with data.table - grouped by company, loop over the 'cat' column, get the diff of adjacent elements, convert to sign, and rename with factor labels, melt to long format and reshape back to 'wide' format with dcast
library(data.table)
dcast(
  melt(
    setDT(df)[, lapply(.SD, \(x) factor(sign(diff(x)),
                                        levels = c(-1, 0, 1),
                                        labels = c("down", "no-change", "up"))),
              company, .SDcols = patterns("^cat")],
    id.var = "company", value.name = "type"
  ),
  type ~ paste0(variable, "_change"), length
)
output
        type cat1_change cat2_change
1:      down           5           2
2: no-change           2           1
3:        up           2           6
I am running this code to create the data frame shown in the picture below:
library(tidyverse)
library(espnscrapeR)
steelers_data <- espnscrapeR::get_nfl_pbp(401326308) %>%
  group_by(drive_id) %>%
  janitor::clean_names() %>%
  dplyr::rename(
    posteam = pos_team_abb,
    qtr = drive_start_qtr,
    desc = play_desc
  ) %>%
  select(game_id, drive_id, posteam, play_id, play_type, yards_gained, desc,
         logo, qtr, drive_result, home_team_abb, away_team_abb, home_wp) %>%
  mutate(home_wp = dplyr::lag(home_wp, 1)) %>%
  mutate(away_wp = 1.00 - home_wp) %>%
  filter(posteam == "PIT") %>%
  filter(!play_type %in% c("Kickoff", "End Period", "End of Half", "Timeout",
                           "Kickoff Return (Offense)", "End of Game"))
## testing
testing <- steelers_data %>%
  filter(drive_id == "4013263082") %>%
  group_by(play_id) %>%
  summarize(wp_data = list(away_wp), .groups = "drop")
However, I would like each row's wp_data to also carry over the prior rows' values, i.e. a running accumulation. My desired output is this:
Any help is greatly appreciated.
Use purrr::accumulate:
library(tidyverse)
data.frame(A = 1:10) %>%
  mutate(A2 = purrr::accumulate(A, c))
or Reduce:
df <- data.frame(A = 1:10)
df$A2 <- Reduce(c, df$A, accumulate = TRUE)
output
A A2
1 1 1
2 2 1, 2
3 3 1, 2, 3
4 4 1, 2, 3, 4
5 5 1, 2, 3, 4, 5
6 6 1, 2, 3, 4, 5, 6
7 7 1, 2, 3, 4, 5, 6, 7
8 8 1, 2, 3, 4, 5, 6, 7, 8
9 9 1, 2, 3, 4, 5, 6, 7, 8, 9
10 10 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
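Applied to the question's testing tibble, a minimal sketch (assuming the plays are already in the desired order) might be:

library(dplyr)
library(purrr)

# Sketch: accumulate the list-column so each play's wp_data also carries
# every earlier play's values (assumes `testing` built as in the question).
testing_cumulative <- testing %>%
  mutate(wp_data = accumulate(wp_data, c))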
I have a list of dataframes. It looks something like this:
df1 <- data.frame(Var1 = c(1, 7, 9, 4, 2),
                  Var2 = c(7, 2, 4, 4, 3),
                  Var3 = c(3, 6, 2, 0, 8))
df2 <- data.frame(Var1 = c(5, 6, 2, 2, 1),
                  Var2 = c(8, 6, 6, 7, 4),
                  Var3 = c(9, 0, 1, 3, 4))
df3.wxyz <- data.frame(Var1 = c("w", "x", "y", "z", 3, 7, 3, 6, 6),
                       Var2 = c(NA, NA, NA, NA, 7, 5, 8, 0, 2),
                       Var3 = c(NA, NA, NA, NA, 3, 3, 4, 1, 9))
df4 <- data.frame(Var1 = c(2, 7, 2, 4, 8),
                  Var2 = c(8, 3, 1, 7, 3),
                  Var3 = c(9, 1, 1, 6, 5))
df5.wxyz <- data.frame(Var1 = c("w", "x", "y", "z", 2, 7, 3, 1, 6),
                       Var2 = c(NA, NA, NA, NA, 7, 4, 8, 1, 9),
                       Var3 = c(NA, NA, NA, NA, 8, 0, 4, 1, 2))
df.list <- list(df1, df2, df3.wxyz, df4, df5.wxyz)
names(df.list) <- c("df1", "df2", "df3.wxyz", "df4", "df5.wxyz")
I would like to remove the first 4 rows of df3.wxyz and df5.wxyz from the list of dataframes, as those rows contain information that I do not need. I tried the following code, but instead of removing the first 4 rows only in df3.wxyz and df5.wxyz, it removes the first 4 rows from every dataframe in my list. I'm not sure what the issue is.
df.list <- lapply(df.list, function(i){
  ifelse(grepl("wxyz", names(df.list)), i <- i[-c(1:4), ], df.list)
  i
})
This is what I would like to achieve:
df1 <- data.frame(Var1 = c(1, 7, 9, 4, 2),
                  Var2 = c(7, 2, 4, 4, 3),
                  Var3 = c(3, 6, 2, 0, 8))
df2 <- data.frame(Var1 = c(5, 6, 2, 2, 1),
                  Var2 = c(8, 6, 6, 7, 4),
                  Var3 = c(9, 0, 1, 3, 4))
df3.wxyz <- data.frame(Var1 = c(3, 7, 3, 6, 6),
                       Var2 = c(7, 5, 8, 0, 2),
                       Var3 = c(3, 3, 4, 1, 9))
df4 <- data.frame(Var1 = c(2, 7, 2, 4, 8),
                  Var2 = c(8, 3, 1, 7, 3),
                  Var3 = c(9, 1, 1, 6, 5))
df5.wxyz <- data.frame(Var1 = c(2, 7, 3, 1, 6),
                       Var2 = c(7, 4, 8, 1, 9),
                       Var3 = c(8, 0, 4, 1, 2))
df.list <- list(df1, df2, df3.wxyz, df4, df5.wxyz)
names(df.list) <- c("df1", "df2", "df3.wxyz", "df4", "df5.wxyz")
You can try:
df.list[grepl('wxyz', names(df.list))] <- lapply(df.list[grepl('wxyz', names(df.list))], na.omit)
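This works because the rows you want to drop are exactly the rows containing NAs. As for the original attempt: ifelse() is vectorised, and the assignment i <- i[-c(1:4), ] in its yes argument is evaluated as a side effect on every iteration whenever any element of the test is TRUE, so every data frame loses its first four rows. A sketch that keeps the lapply() structure but branches with if() on each element's name instead (assuming the goal is only to drop the first four rows of the wxyz frames):

# Sketch: loop over the names and branch per element rather than with ifelse().
df.list <- setNames(
  lapply(names(df.list), function(nm) {
    x <- df.list[[nm]]
    if (grepl("wxyz", nm)) x <- x[-(1:4), ]
    x
  }),
  names(df.list)
)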
You can try na.omit like below
> Map(na.omit,df.list)
$df1
Var1 Var2 Var3
1 1 7 3
2 7 2 6
3 9 4 2
4 4 4 0
5 2 3 8
$df2
Var1 Var2 Var3
1 5 8 9
2 6 6 0
3 2 6 1
4 2 7 3
5 1 4 4
$df3.wxyz
Var1 Var2 Var3
5 3 7 3
6 7 5 3
7 3 8 4
8 6 0 1
9 6 2 9
$df4
Var1 Var2 Var3
1 2 8 9
2 7 3 1
3 2 1 1
4 4 7 6
5 8 3 5
$df5.wxyz
Var1 Var2 Var3
5 2 7 8
6 7 4 0
7 3 8 4
8 1 1 1
9 6 9 2
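One caveat (an assumption about the desired result): in the wxyz frames, Var1 was read as character because of the "w"/"x"/"y"/"z" rows, so after dropping them you may want to restore the numeric type. A sketch, assuming a recent R where type.convert() has a data frame method:

# Sketch: re-derive column types after removing the header-like rows.
df.list <- lapply(df.list, type.convert, as.is = TRUE)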
I have the following code and am unsure how it would be written using dplyr.
df <- data.frame(
  a = c(1, 1, 1, 2, 2, 2, 2, 2),
  b = c(1, 2, 3, 2, 3, 2, 3, 2),
  c = c(1, 2, 3, 4, 3, 4, 3, 4),
  d = c(1, 2, 3, 4, 5, 4, 5, 4),
  e = c(1, 2, 3, 2, 3, 4, 3, 5)
)
n <- 100
results <- data.frame(levels = double(), amount = double())
for (i in 1:n) {
  r <- df %>% select_if(~ n_distinct(.) == i)
  if (dim(r)[2] > 0) {
    results <- rbind(results, data.frame(levels = i, amount = dim(r)[2]))
  }
}
results
which outputs
  levels amount
1      2      1
2      3      1
3      4      1
4      5      2
The use of the for loop and if statement makes me think there must be a
nicer approach though, or at least, one that makes use of dplyr instead.
edit
Data frame with different types
df <- data.frame(
  a = c(1, 1, 1, 2, 2, 2, 2, 2),
  b = c(1, 2, 3, 2, 3, 2, 3, 2),
  c = c(1, 2, 3, 4, 3, 4, 3, 4),
  d = c(1, 2, 3, 4, 5, 4, 5, 4),
  e = c(1, 2, 3, 2, 3, 4, 3, 5),
  f = c('a', 'b', 'a', 'a', 'a', 'a', 'a', 'b')
)
One dplyr and tidyr possibility could be:
df %>%
  pivot_longer(everything()) %>%
  group_by(name) %>%
  summarise(n_levels = n_distinct(value)) %>%
  ungroup() %>%
  count(n_levels)
  n_levels     n
     <int> <int>
1        2     1
2        3     1
3        4     1
4        5     2
library(dplyr)
library(tidyr)
df %>%
  summarise_all(.funs = function(x) length(unique(x))) %>%
  pivot_longer(everything()) %>% # or gather()
  count(value)
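As a side note, summarise_all() is superseded in current dplyr; a sketch of the same idea with across() (assuming dplyr >= 1.0):

# Sketch: per-column distinct counts with across(), then tally by count.
df %>%
  summarise(across(everything(), n_distinct)) %>%
  pivot_longer(everything()) %>%
  count(value)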
A base R approach could be:
stack(table(sapply(df, function(x) length(unique(x)))))
# ind values
#1 2 1
#2 3 1
#3 4 1
#4 5 2
I think this is a better way to do what you want, using dplyr and purrr.
library(tidyverse)
df <- data.frame(
  a = c(1, 1, 1, 2, 2, 2, 2, 2),
  b = c(1, 2, 3, 2, 3, 2, 3, 2),
  c = c(1, 2, 3, 4, 3, 4, 3, 4),
  d = c(1, 2, 3, 4, 5, 4, 5, 4),
  e = c(1, 2, 3, 2, 3, 4, 3, 5)
)

map_df(df, function(d) {
  data.frame(level = n_distinct(d))
}) %>%
  group_by(level) %>%
  summarise(amount = n())
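map_df() is superseded in newer purrr; a minimal sketch of the same tally without it, assuming the tidyverse is loaded as above:

# Sketch: count columns per number of distinct values with map_int().
tibble(level = map_int(df, n_distinct)) %>%
  count(level, name = "amount")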
I have the following data frame:
df = structure(list(Group = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3,
3), index = c(1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 1, 2, 3)), row.names = c(NA,
-13L), class = c("tbl_df", "tbl", "data.frame"))
I would like to replicate the index column according to the Group column in two ways: once with each value repeated n consecutive times, and once with the whole sequence repeated as a block n times, where n is the size of the group (similar to rep() with each versus rep() with times).
So the output would look like this (showing only Group 1, because the full output is too long):
First option:
df = structure(list(Group = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1), index = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4,
4, 4, 4)), row.names = c(NA, -16L), class = c("tbl_df", "tbl",
"data.frame"))
Second option:
df = structure(list(Group = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1), index = c(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
2, 3, 4)), row.names = c(NA, -16L), class = c("tbl_df", "tbl",
"data.frame"))
How do I do this with group_by?
You could use rep and slice like this:
library(dplyr)
Option 1:
df %>%
  group_by(Group) %>%
  slice(rep(seq_len(n()), each = n()))
Option 2:
df %>%
  group_by(Group) %>%
  slice(rep(seq_len(n()), n()))
You can use a combination of do and lapply to replicate the whole group:
df %>%
  group_by(Group) %>%
  do(lapply(., rep, times = nrow(.)) %>% as.data.frame())

df %>%
  group_by(Group) %>%
  do(lapply(., rep, each = nrow(.)) %>% as.data.frame())
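do() is itself superseded in current dplyr; the each = variant can be sketched with group_modify() instead (assuming dplyr >= 1.0):

# Sketch: repeat each row of the group n() times consecutively.
df %>%
  group_by(Group) %>%
  group_modify(~ .x[rep(seq_len(nrow(.x)), each = nrow(.x)), , drop = FALSE])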
We can use uncount
library(tidyverse)
df %>%
  group_by(Group) %>%
  uncount(n())
# A tibble: 61 x 2
# Groups: Group [3]
# Group index
# <dbl> <dbl>
# 1 1 1
# 2 1 1
# 3 1 1
# 4 1 1
# 5 1 2
# 6 1 2
# 7 1 2
# 8 1 2
# 9 1 3
#10 1 3
# … with 51 more rows
Or using data.table
library(data.table)
setDT(df)[, .SD[rep(seq_len(.N), .N)], Group]
Or with base R
do.call(rbind, lapply(split(df, df$Group),
                      function(x) x[rep(seq_len(nrow(x)), nrow(x)), ]))
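Note that the uncount() version reproduces the first option (each value repeated consecutively), while the data.table and base R lines above reproduce the second (the whole block repeated). For the first option with those tools, a minimal sketch using each = in rep(), assuming the same df:

# Sketch: option 1 (consecutive repeats) in data.table and base R.
setDT(df)[, .SD[rep(seq_len(.N), each = .N)], Group]

do.call(rbind, lapply(split(df, df$Group),
                      function(x) x[rep(seq_len(nrow(x)), each = nrow(x)), ]))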