df = data.frame(group=c(1,1,1,2,2,2,3,3,3),
score=c(11,NA,7,NA,NA,4,6,9,15),
MAKE=c(11,11,11,4,4,4,15,15,15))
Say you have data as above with group and score and the objective is to make new variable MAKE which is just the maximum value of score for each group repeated.
And this is my attempt yet it does not work.
df %>%
group_by(group) %>%
summarise(Value = max(is.na(score)))
For that you need
df %>% group_by(group) %>% mutate(MAKE = max(score, na.rm = TRUE))
# A tibble: 9 x 3
# Groups: group [3]
# group score MAKE
# <dbl> <dbl> <dbl>
# 1 1 11 11
# 2 1 NA 11
# 3 1 7 11
# 4 2 NA 4
# 5 2 NA 4
# 6 2 4 4
# 7 3 6 15
# 8 3 9 15
# 9 3 15 15
The issue with max(is.na(score)) is that is.na(score) is a logical vector and when max is applied, it gets coerced to a binary vector with 1 for TRUE and 0 for FALSE. A somewhat less natural solution but closer to what you tried then would be
df %>% group_by(group) %>% mutate(MAKE = max(score[!is.na(score)]))
which finds the maximal value among all those values of score that are not NA.
Related
I have a dataset with financial data. Sometimes, a product gets refunded, resulting in a negative count of the product (so the money gets returned). I want to conditionally filter these rows out of the dataset.
Example:
library(tidyverse)
set.seed(1)
df <- tibble(
count = sample(c(-1,1),80,replace = TRUE,prob=c(.2,.8)),
id = rep(1:4,20)
)
df %>%
group_by(id) %>%
summarize(total = sum(count))
# A tibble: 4 x 2
id total
<int> <dbl>
1 1 10
2 2 14
3 3 16
4 4 10
id = 1 has 15 positive counts and 5 negatives. (15 - 5= 10). I want to keep 10 values in df with id = 1 with the positive values.
id = 2 has 17 positive counts and 3 negatives. (17- 3 = 14). I want to keep 14 values in df with id = 2 with the positive values.
In the end, this condition should be True nrow(df) == sum(df$count)
Unfortunately, a filtering join such as anti_join() will remove all the rows. For some reason I cannot think of another option to filter the tibble.
Thanks for helping me!
You can "uncount" using the total column to get the number of repeats of each row.
df %>%
group_by(id) %>%
summarize(total = sum(count)) %>%
uncount(total) %>%
mutate(count = 1)
#> # A tibble: 50 x 2
#> id count
#> <int> <dbl>
#> 1 1 1
#> 2 1 1
#> 3 1 1
#> 4 1 1
#> 5 1 1
#> 6 1 1
#> 7 1 1
#> 8 1 1
#> 9 1 1
#> 10 1 1
#> # ... with 40 more rows
Created on 2022-10-21 with reprex v2.0.2
I have two dfs : df1 and df2 where the column names are dates. When I join the two df's I get columns like
date1.x, date1.y, date2.x, date2.y, date3.x, date3.y, date4.x, date4.y...........
I want to create new columns which have values which are multiplication of date1.x and date1.y and similarly for other date pairs as well.
df <- data.frame(id=11:13, date1.x=1:3, date2.x=4:6, date1.y=7:9, date2.y=10:12)
df
# id date1.x date2.x date1.y date2.y
# 1 11 1 4 7 10
# 2 12 2 5 8 11
# 3 13 3 6 9 12
grep("^date.*\\.x$", colnames(df), value = TRUE)
# [1] "date1.x" "date2.x"
datenms <- grep("^date.*\\.x$", colnames(df), value = TRUE)
### make sure all of our 'date#.x' columns have matching 'date#.y' columns
datenms <- datenms[ gsub("x$", "y", datenms) %in% colnames(df) ]
datenms
# [1] "date1.x" "date2.x"
subset(df, select = datenms)
# date1.x date2.x
# 1 1 4
# 2 2 5
# 3 3 6
subset(df, select = gsub("x$", "y", datenms))
# date1.y date2.y
# 1 7 10
# 2 8 11
# 3 9 12
subset(df, select = datenms) * subset(df, select = gsub("x$", "y", datenms))
# date1.x date2.x
# 1 7 40
# 2 16 55
# 3 27 72
There are a number of ways to do this, but I suggest that it is a good practice to get used to transforming your data into a format that is easy to work with. The first answer showed you one way to do what you want without transforming your data. My answer will show you how to transform the data so that calculation (this one and others) are easy, and then how to perform the calculation once the data is tidy.
Making your data tidy helps to perform easier aggregations, to graph results, to perform feature engineering for models, etc.
library(dplyr)
library(tidyr)
df <- data.frame(id=11:13, date1.x=1:3, date2.x=4:6, date1.y=7:9, date2.y=10:12)
df
# id date1.x date2.x date1.y date2.y
# 1 11 1 4 7 10
# 2 12 2 5 8 11
# 3 13 3 6 9 12
# Convert the data to a tidy format that is easier for computers to calculate
tidy_df <- df %>%
pivot_longer(
cols = starts_with("date"), # We are tidying any column starting with date
names_to = c("date_num","date_source"), # creating two columns for names
values_to = c("date_value"), # creating one column for values
names_prefix = "date", # removing the "date" prefix
names_sep = "\\." # splitting the names on the period `.`
)
tidy_df
# id date_num date_source date_value
# <int> <chr> <chr> <int>
# 1 11 1 x 1
# 2 11 2 x 4
# 3 11 1 y 7
# 4 11 2 y 10
# 5 12 1 x 2
# 6 12 2 x 5
# 7 12 1 y 8
# 8 12 2 y 11
# 9 13 1 x 3
# 10 13 2 x 6
# 11 13 1 y 9
# 12 13 2 y 12
# Now that the data is tidy we can do easier dataframe grouping and aggregation
tidy_df %>%
group_by(id,date_num) %>%
summarise(date_value_mult = prod(date_value)) %>%
ungroup()
# id date_num date_value_mult
# <int> <chr> <dbl>
# 1 11 1 7
# 2 11 2 40
# 3 12 1 16
# 4 12 2 55
# 5 13 1 27
# 6 13 2 72
# If/When you eventually want the data in a more human readable format you can
# pivot the data back into a human readable format. This is likely after all
# computer calculations are done and you want to present the data. For storing
# the data (such as in a database) you would not need/want this step.
tidy_df %>%
group_by(id,date_num) %>%
summarise(date_value_mult = prod(date_value)) %>%
ungroup() %>%
pivot_wider(
names_from = date_num,
values_from = date_value_mult,
names_prefix = "date"
)
# id date1 date2
# <int> <dbl> <dbl>
# 1 11 7 40
# 2 12 16 55
# 3 13 27 72
I have an incomplete time series dataframe and I need to insert rows of NAs for missing time stamps. There should always be 6 time stamps per day, which is indicated by the variable "Signal" (1-6) in the dataframe. I am trying to merge the incomplete dataframe A with a vector Bcontaining all Signals. Simplified example data below:
B <- rep(1:6,2)
A <- data.frame(Signal = c(1,2,3,5,1,2,4,5,6), var1 = c(1,1,1,1,1,1,1,1,1))
Expected <- data.frame(Signal = c(1,2,3,NA, 5, NA, 1,2,NA,4,5,6), var1 = c(1,1,1,NA,1,NA,1,1,NA,1,1,1)
Note that Brepresents a dataframe with multiple variables and the NAs in Expected are rows of NAs in the dataframe. Also the actual dataframe has more observations (84 in total).
Would be awesome if you guys could help me out!
If you already know there are 6 timestamps in a day you can do this without B. We can create groups for each day and use complete to add the missing observations with NA.
library(dplyr)
library(tidyr)
A %>%
group_by(gr = cumsum(c(TRUE, diff(Signal) < 0))) %>%
complete(Signal = 1:6) %>%
ungroup() %>%
select(-gr)
# Signal var1
# <dbl> <dbl>
# 1 1 1
# 2 2 1
# 3 3 1
# 4 4 NA
# 5 5 1
# 6 6 NA
# 7 1 1
# 8 2 1
# 9 3 NA
#10 4 1
#11 5 1
#12 6 1
If in the output you need Signal as NA for missing combination you can use
A %>%
group_by(gr = cumsum(c(TRUE, diff(Signal) < 0))) %>%
complete(Signal = 1:6) %>%
mutate(Signal = replace(Signal, is.na(var1), NA)) %>%
ungroup %>%
select(-gr)
# Signal var1
# <dbl> <dbl>
# 1 1 1
# 2 2 1
# 3 3 1
# 4 NA NA
# 5 5 1
# 6 NA NA
# 7 1 1
# 8 2 1
# 9 NA NA
#10 4 1
#11 5 1
#12 6 1
I want to compute the minimum distance between the current row and every row before it within each group. My data frame has several groups, and each group has multiple dates with longitude and latitude. I use a Haversine function to compute distance, and I need to apply this function as described above. The data frame looks like the following:
grp date long lat rowid
1 1 1995-07-01 11 12 1
2 1 1995-07-05 3 0 2
3 1 1995-07-09 13 4 3
4 1 1995-07-13 4 25 4
5 2 1995-03-07 12 6 1
6 2 1995-03-10 3 27 2
7 2 1995-03-13 34 8 3
8 2 1995-03-16 25 9 4
My current attempt uses purrrlyr::by_row, but the method is too slow. In practice, each group has thousands of dates and geographic positions. Here is part of my current attempt:
calc_min_distance <- function(df, grp.name, row){
df %>%
filter(
group_name==grp.name
) %>%
filter(
row_number() <= row
) %>%
mutate(
last.lat = last(lat),
last.long = last(long),
rowid = 1:n()
) %>%
group_by(rowid) %>%
purrrlyr::by_row(
~haversinedistance.fnct(.$last.long, .$last.lat, .$long, .$lat),
.collate='rows',
.to = 'min.distance'
) %>%
filter(
row_number() < n()
) %>%
summarise(
min = min(min.distance)
) %>%
.$min
}
df_dist <-
df %>%
group_by(grp_name) %>%
mutate(rowid = 1:n()) %>%
group_by(grp_name, rowid) %>%
purrrlyr::by_row(
~calc_min_distance(df, .$grp_name,.$rowid),
.collate='rows',
.to = 'min.distance'
) %>%
ungroup %>%
select(-rowid)
Suppose that distance is defined as (lat + long) for reference row - (lat + long) for each pairwise row less than the reference row. My expected output for grp 1 is the following:
grp date long lat rowid min.distance
1 1 1995-07-01 11 12 1 0
2 1 1995-07-05 3 0 2 -20
3 1 1995-07-09 13 4 3 -6
4 1 1995-07-13 4 25 4 6
How can I quickly compute the minimum distance between the current rowid and all rowids before it?
Here's how I would go about it. You need to calculate all the within-group pair-wise distances anyway, so we'll use geosphere::distm which is designed to do just that. I'd suggest stepping through my function line-by-line and looking at what it does, I think it will make sense.
library(geosphere)
find_min_dist_above = function(long, lat, fun = distHaversine) {
d = distm(x = cbind(long, lat), fun = fun)
d[lower.tri(d, diag = TRUE)] = NA
d[1, 1] = 0
return(apply(d, MAR = 2, min, na.rm = TRUE))
}
df %>% group_by(grp) %>%
mutate(min.distance = find_min_dist_above(long, lat))
# # A tibble: 8 x 6
# # Groups: grp [2]
# grp date long lat rowid min.distance
# <int> <fct> <int> <int> <int> <dbl>
# 1 1 1995-07-01 11 12 1 0
# 2 1 1995-07-05 3 0 2 1601842.
# 3 1 1995-07-09 13 4 3 917395.
# 4 1 1995-07-13 4 25 4 1623922.
# 5 2 1995-03-07 12 6 1 0
# 6 2 1995-03-10 3 27 2 2524759.
# 7 2 1995-03-13 34 8 3 2440596.
# 8 2 1995-03-16 25 9 4 997069.
Using this data:
df = read.table(text = ' grp date long lat rowid
1 1 1995-07-01 11 12 1
2 1 1995-07-05 3 0 2
3 1 1995-07-09 13 4 3
4 1 1995-07-13 4 25 4
5 2 1995-03-07 12 6 1
6 2 1995-03-10 3 27 2
7 2 1995-03-13 34 8 3
8 2 1995-03-16 25 9 4', h = TRUE)
I have the following data
data = tribble(~t,~key,~value,
1,"a",10,
2,"a",20,
3,"a",30,
1,"b",100,
2,"b",200,
3,"b",300,
1,"c",1000,
2,"c",2000,
3,"c",3000)
and would like to get the following result
result = tribble(~t,~key,~value,
1,"a",-20,
2,"a",-10,
3,"a",0,
1,"b",-200,
2,"b",-100,
3,"b",0,
1,"c",-2000,
2,"c",-3000,
3,"c",0)
The idea is that I would like to subtract the 3rd value from all of the other values in that group. I tried to group_by the key, but struggled on the row wise subtraction within the group
We can use the last function from the dplyr. The arrange function is to make sure your dataset are in the right order.
library(dplyr)
data2 <- data %>%
arrange(key, t) %>%
group_by(key) %>%
mutate(value = value - last(value)) %>%
ungroup()
data2
# # A tibble: 9 x 3
# t key value
# <dbl> <chr> <dbl>
# 1 1 a -20
# 2 2 a -10
# 3 3 a 0
# 4 1 b -200
# 5 2 b -100
# 6 3 b 0
# 7 1 c -2000
# 8 2 c -1000
# 9 3 c 0