left_join gives error with three key variables - r

I am getting an error when I try to merge two data sets using left_join with three "key" variables. Here is a reproducible example:
df1 <- tribble(
~var1, ~var2, ~var3, ~var4,
1, 1, 1, 4,
2, 2, 2, 5,
3, 3, 3, 6
)
df2 <- tribble(
~var1, ~var2, ~var3, ~var5,
1, 1, 1, 7,
2, 2, 2, 8,
3, 3, 3, 9
)
df3<- df1 %>%
left_join(df2, by = "var1", "var2", "var3")
This gives the following error:
Error: `suffix` must be a character vector of length 2, not string of length
1
I know that join must be having an issue naming the key variables that are used for join, but I don't understand why.
Here is what I would like to have as a result:
# A tibble: 3 x 5
var1 var2 var3 var4 var5
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 4 7
2 2 2 2 5 8
3 3 3 3 6 9
Solution (thanks #user127649)
df3<- df1 %>%
left_join(df2, by = c("var1", "var2", "var3"))

I'd do this:
df1 <- tribble(
~var1, ~var2, ~var3, ~var4,
1, 1, 1, 4,
2, 2, 2, 5,
3, 3, 3, 6
)
df2 <- tribble(
~var1, ~var2, ~var3, ~var5,
1, 1, 1, 7,
2, 2, 2, 8,
3, 3, 3, 9
)
df3 <- merge(df1, df2, all.x = TRUE)

Related

Calculate changes in columns of daily tidy data

Each day a company creates a value for category_1 and category_2.
A new company may enter the survey midway as company E appears on Dec 25.
Here are three days of data. So, two intervals: Dec 24-25 and Dec 25-26.
Question
For each category, how many increases/decreases/no changes were there over the 3 days?
For example, in cat1 A goes from a 2 to 1, B goes from a 3 to a 4, etc.
By hand I get:
cat1 - Up: 2, Down: 5, No change: 2
cat2 - Up: 6, Down: 2, No change: 1
How do I calculate the number of up/downs/no changes in an R Script?
library("tidyverse")
d1 <- as.Date("2022-12-24")
d2 <- as.Date("2022-12-25")
d3 <- as.Date("2022-12-26")
df <- tibble(
company = c(LETTERS[1:4], LETTERS[1:5], LETTERS[1:5]),
cat1 = c(2, 3, 4, 5, 1, 4, 5, 3, 2, 1, 4, 4, 2, 1),
cat2 = c(6, 7, 8, 9, 5, 5, 9, 10, 11, 6, 5, 10, 12, 13),
date = c(rep(d1, 4), rep(d2, 5), rep(d2, 5))
)
df
One approach using dplyr, assuming arranged data. Note: I changed the typo in date 3 to d3.
library(dplyr)
# Count day-over-day increases, decreases and unchanged values per category.
# lag() is applied within each company, so rows must already be ordered by
# date inside every company.
df %>%
  group_by(company) %>%
  mutate(cat1_change = cat1 - lag(cat1), cat2_change = cat2 - lag(cat2)) %>%
  ungroup() %>%
  # The first observation of each company has no predecessor, so its change
  # is NA; na.rm = TRUE keeps those rows out of all three counts.
  # NOTE: dplyr >= 1.1.0 deprecates summarize() calls that return more than
  # one row per group; reframe() is the drop-in replacement there.
  summarize(type = c("up", "down", "no-change"),
            across(ends_with("change"), ~
              c(sum(.x > 0, na.rm = TRUE),
                sum(.x < 0, na.rm = TRUE),
                sum(.x == 0, na.rm = TRUE))))
# A tibble: 3 × 3
type cat1_change cat2_change
<chr> <int> <int>
1 up 2 6
2 down 5 2
3 no-change 2 1
Data
df <- structure(list(company = c("A", "B", "C", "D", "A", "B", "C",
"D", "E", "A", "B", "C", "D", "E"), cat1 = c(2, 3, 4, 5, 1, 4,
5, 3, 2, 1, 4, 4, 2, 1), cat2 = c(6, 7, 8, 9, 5, 5, 9, 10, 11,
6, 5, 10, 12, 13), date = structure(c(19350, 19350, 19350, 19350,
19351, 19351, 19351, 19351, 19351, 19352, 19352, 19352, 19352,
19352), class = "Date")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -14L))
An option with data.table - grouped by company, loop over the 'cat' column, get the diff of adjacent elements, convert to sign, and rename with factor labels, melt to long format and reshape back to 'wide' format with dcast
library(data.table)
dcast(melt(setDT(df)[, lapply(.SD, \(x) factor(sign(diff(x)),
levels = c(-1, 0, 1), labels = c("down", "no-change", "up"))),
company, .SDcols = patterns("^cat")], id.var = "company",
value.name = "type"), type ~ paste0(variable, "_change"), length)
-output
type cat1_change cat2_change
1: down 5 2
2: no-change 2 1
3: up 2 6

Add next row's value to a list and then repeat adding row after row

I am running this code to create the DF included in the picture below:
library(tidyverse)
library(espnscrapeR)
steelers_data <- espnscrapeR::get_nfl_pbp(401326308) %>%
group_by(drive_id) %>%
janitor::clean_names() %>%
dplyr::rename(
posteam = pos_team_abb,
qtr = drive_start_qtr,
desc = play_desc) %>%
select(game_id, drive_id, posteam, play_id, play_type, yards_gained, desc,
logo, qtr, drive_result, home_team_abb, away_team_abb, home_wp) %>%
mutate(home_wp = dplyr::lag(home_wp, 1)) %>%
mutate(away_wp = 1.00 - home_wp) %>%
filter(posteam == "PIT") %>%
filter(!play_type %in% c("Kickoff", "End Period", "End of Half", "Timeout", "Kickoff Return (Offense)", "End of Game"))
##testing
testing <- steelers_data %>%
filter(drive_id == "4013263082") %>%
group_by(play_id) %>%
summarize(wp_data = list(away_wp), .groups = "drop")
However, I would like to add the next row's wp_data to the prior. My desired output is this:
Any help is greatly appreciated.
Use purrr::accumulate:
library(tidyverse)
data.frame(A = 1:10) %>%
mutate(A2 = purrr::accumulate(A, c))
or Reduce:
# Base R equivalent of purrr::accumulate(): with accumulate = TRUE, Reduce()
# returns every intermediate result of c(), so element i of the list-column
# A2 holds A[1:i].
df <- data.frame(A = 1:10)
df$A2 <- Reduce(c, df$A, accumulate = TRUE)
output
A A2
1 1 1
2 2 1, 2
3 3 1, 2, 3
4 4 1, 2, 3, 4
5 5 1, 2, 3, 4, 5
6 6 1, 2, 3, 4, 5, 6
7 7 1, 2, 3, 4, 5, 6, 7
8 8 1, 2, 3, 4, 5, 6, 7, 8
9 9 1, 2, 3, 4, 5, 6, 7, 8, 9
10 10 1, 2, 3, 4, 5, 6, 7, 8, 9, 10

Remove rows from list of dataframes based on condition

I have a list of dataframes. It looks something like this:
df1 <- data.frame(Var1 = c(1, 7, 9, 4, 2),
Var2 = c(7, 2, 4, 4, 3),
Var3 = c(3, 6, 2, 0, 8))
df2 <- data.frame(Var1 = c(5, 6, 2, 2, 1),
Var2 = c(8, 6, 6, 7, 4),
Var3 = c(9, 0, 1, 3, 4))
df3.wxyz <- data.frame(Var1 = c("w", "x", "y", "z", 3, 7, 3, 6, 6),
Var2 = c(NA, NA, NA, NA, 7, 5, 8, 0, 2),
Var3 = c(NA, NA, NA, NA, 3, 3, 4, 1, 9))
df4 <- data.frame(Var1 = c(2, 7, 2, 4, 8),
Var2 = c(8, 3, 1, 7, 3),
Var3 = c(9, 1, 1, 6, 5))
df5.wxyz <- data.frame(Var1 = c("w", "x", "y", "z", 2, 7, 3, 1, 6),
Var2 = c(NA, NA, NA, NA, 7, 4, 8, 1, 9),
Var3 = c(NA, NA, NA, NA, 8, 0, 4, 1, 2))
df.list <- list(df1, df2, df3.wxyz, df4, df5.wxyz)
names(df.list) <- c("df1", "df2", "df3.wxyz", "df4", "df5.wxyz")
I would like to remove the first 4 rows of df3.wxyz and df5.wxyz from the list of dataframes as those contain information that I do not need. What I've tried is the following code, but instead of only removing the first 4 rows in df3.wxyz and df5.wxyz, it is removing the first 4 rows from every dataframe in my list. I'm not sure what the issue is.
df.list <- lapply(df.list, function(i){
ifelse(grepl("wxyz", names(df.list)), i <- i[-c(1:4), ], df.list)
i
})
This is what I would like to achieve:
df1 <- data.frame(Var1 = c(1, 7, 9, 4, 2),
Var2 = c(7, 2, 4, 4, 3),
Var3 = c(3, 6, 2, 0, 8))
df2 <- data.frame(Var1 = c(5, 6, 2, 2, 1),
Var2 = c(8, 6, 6, 7, 4),
Var3 = c(9, 0, 1, 3, 4))
df3.wxyz <- data.frame(Var1 = c(3, 7, 3, 6, 6),
Var2 = c(7, 5, 8, 0, 2),
Var3 = c(3, 3, 4, 1, 9))
df4 <- data.frame(Var1 = c(2, 7, 2, 4, 8),
Var2 = c(8, 3, 1, 7, 3),
Var3 = c(9, 1, 1, 6, 5))
df5.wxyz <- data.frame(Var1 = c(2, 7, 3, 1, 6),
Var2 = c(7, 4, 8, 1, 9),
Var3 = c(8, 0, 4, 1, 2))
df.list <- list(df1, df2, df3.wxyz, df4, df5.wxyz)
names(df.list) <- c("df1", "df2", "df3.wxyz", "df4", "df5.wxyz")
You can try,
df.list[grepl('wxyz', names(df.list))] <- lapply(df.list[grepl('wxyz', names(df.list))], na.omit)
You can try na.omit like below
> Map(na.omit,df.list)
$df1
Var1 Var2 Var3
1 1 7 3
2 7 2 6
3 9 4 2
4 4 4 0
5 2 3 8
$df2
Var1 Var2 Var3
1 5 8 9
2 6 6 0
3 2 6 1
4 2 7 3
5 1 4 4
$df3.wxyz
Var1 Var2 Var3
5 3 7 3
6 7 5 3
7 3 8 4
8 6 0 1
9 6 2 9
$df4
Var1 Var2 Var3
1 2 8 9
2 7 3 1
3 2 1 1
4 4 7 6
5 8 3 5
$df5.wxyz
Var1 Var2 Var3
5 2 7 8
6 7 4 0
7 3 8 4
8 1 1 1
9 6 9 2

Find number of variables on different levels within a dataframe using dplyr?

I have the following code and am unsure how this would be written using dplyr
df <- data.frame(
a = c(1, 1, 1, 2, 2, 2, 2, 2),
b = c(1, 2, 3, 2, 3, 2, 3, 2),
c = c(1, 2, 3, 4, 3, 4, 3, 4),
d = c(1, 2, 3, 4, 5, 4, 5, 4),
e = c(1, 2, 3, 2, 3, 4, 3, 5)
)
n = 100
results=data.frame(levels=double(),amount=double())
for(i in 1:n){
r <- df %>% select_if(~n_distinct(.)==i)
if(dim(r)[2]>0){
results=rbind(results,data.frame(levels=i,amount=dim(r)[2]))
}
}
results
which outputs
levels amount
1 2 1
2 3 1
3 4 1
4 5 2
The use of the for loop and if statement makes me think there must be a
nicer approach though, or at least, one that makes use of dplyr instead.
edit
Data frame with different types
df <- data.frame(
a = c(1, 1, 1, 2, 2, 2, 2, 2),
b = c(1, 2, 3, 2, 3, 2, 3, 2),
c = c(1, 2, 3, 4, 3, 4, 3, 4),
d = c(1, 2, 3, 4, 5, 4, 5, 4),
e = c(1, 2, 3, 2, 3, 4, 3, 5),
f = c('a','b','a','a','a','a','a','b')
)
One dplyr and tidyr possibility could be:
df %>%
pivot_longer(everything()) %>%
group_by(name) %>%
summarise(n_levels = n_distinct(value)) %>%
ungroup() %>%
count(n_levels)
n_levels n
<int> <int>
1 2 1
2 3 1
3 4 1
4 5 2
library(dplyr)
library(tidyr)
# Count the distinct values in every column, reshape the one-row result to
# long format (one row per column), then tally how many columns share each
# distinct-value count.
df %>%
  summarise(across(everything(), function(x) length(unique(x)))) %>%
  pivot_longer(everything()) %>% # gather() also works here but is superseded
  count(value)
A base R approach could be :
stack(table(sapply(df, function(x) length(unique(x)))))
# ind values
#1 2 1
#2 3 1
#3 4 1
#4 5 2
I think this is a better way to do what you want, using dplyr and purrr.
library(tidyverse)
df <- data.frame(
a = c(1, 1, 1, 2, 2, 2, 2, 2),
b = c(1, 2, 3, 2, 3, 2, 3, 2),
c = c(1, 2, 3, 4, 3, 4, 3, 4),
d = c(1, 2, 3, 4, 5, 4, 5, 4),
e = c(1, 2, 3, 2, 3, 4, 3, 5)
)
map_df(df, function(d){
data.frame(level = n_distinct(d))
}) %>%
group_by(level) %>%
summarise(amount = n())

Replicate a data frame by group

I have the following data frame:
df = structure(list(Group = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3,
3), index = c(1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 1, 2, 3)), row.names = c(NA,
-13L), class = c("tbl_df", "tbl", "data.frame"))
I would like to replicate the column index according to the Group column, one time with each number appearing n consecutive times, and a second time all the numbers appear as a group n times, where n is the size of the group (similarly to rep versus rep with each).
So the output would look like this (let's look only at Group 1 because the full output is too long):
First option:
df = structure(list(Group = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1), index = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4,
4, 4, 4)), row.names = c(NA, -16L), class = c("tbl_df", "tbl",
"data.frame"))
Second option:
df = structure(list(Group = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1), index = c(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
2, 3, 4)), row.names = c(NA, -16L), class = c("tbl_df", "tbl",
"data.frame"))
How do I do this with group_by?
You could use rep and slice like this
library(dplyr)
Option 1:
df %>%
group_by(Group) %>%
slice(rep(seq_len(n()), each = n()))
Option 2 :
df %>%
group_by(Group) %>%
slice(rep(seq_len(n()), n()))
You can use a combination of do and lapply to replicate the whole group
df %>% group_by(Group) %>%
do(lapply(.,rep,times=nrow(.)) %>% as.data.frame())
df %>% group_by(Group) %>%
do(lapply(.,rep,each=nrow(.)) %>% as.data.frame())
We can use uncount
library(tidyverse)
df %>%
group_by(Group) %>%
uncount(n())
# A tibble: 61 x 2
# Groups: Group [3]
# Group index
# <dbl> <dbl>
# 1 1 1
# 2 1 1
# 3 1 1
# 4 1 1
# 5 1 2
# 6 1 2
# 7 1 2
# 8 1 2
# 9 1 3
#10 1 3
# … with 51 more rows
Or using data.table
library(data.table)
setDT(df)[, .SD[rep(seq_len(.N), .N)], Group]
Or with base R
do.call(rbind, lapply(split(df, df$Group),
function(x) x[rep(seq_len(nrow(x)), nrow(x)),]))

Resources