reshape grouped data in R - r

I have the following data:
id <- c(1,1,1,1,2,2,2,2,2,2)
date <-as.Date(c("2007-06-22", "2007-06-22", "2007-07-13","2007-07-13",
"2019-10-05", "2019-10-05", "2019-11-07", "2019-11-07",
"2007-06-22","2007-06-22"))
value <-c(0,3,2,4,0,1,4,2,6,8)
mydata_1 <- data.frame(id, date, value)
mydata_1
id date value
1 2007-06-22 0
1 2007-06-22 3
1 2007-07-13 2
1 2007-07-13 4
2 2019-10-05 0
2 2019-10-05 1
2 2019-11-07 4
2 2019-11-07 2
2 2007-06-22 6
2 2007-06-22 8
I would like the data to look like this:
id <- c(1,1,2,2,2)
date <-as.Date(c("2007-06-22", "2007-07-13", "2019-10-05", "2019-11-07","2007-06-22"))
value.1 = c(0,2,0,4,6)
value.2 = c(3,4,1,2,8)
mydata_2 <- data.frame(id, date, value.1, value.2)
mydata_2
id date value.1 value.2
1 2007-06-22 0 3
1 2007-07-13 2 4
2 2019-10-05 0 1
2 2019-11-07 4 2
2 2007-06-22 6 8
I have tried below from (Reshaping data matrix in R) but since some of the dates are the same in the two different id's it is not working as intended
dateno <- with(mydata_1, ave(id, date, FUN = seq_along))
test2 <- transform(mydata_1, dateno = dateno)
reshape(test2, dir = "wide", idvar = c("id","date"), timevar = "dateno")

I think I have come up with an answer following this guide How to transpose a data frame by group using reshape2 library?
mydata_1 = mydata_1 %>% group_by(id,date) %>% mutate(id_2 = paste0("V",row_number()))
library(tidyr)
mydata_2 = spread(data = my, key = id_2, value = value)
mydata_2
id date V1 V2
<dbl> <date> <dbl> <dbl>
1 1 2007-06-22 0 3
2 1 2007-07-13 2 4
3 2 2007-06-22 6 8
4 2 2019-10-05 0 1
5 2 2019-11-07 4 2

Maybe sth. like this:
library(tidyverse)
id <- c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2)
date <- as.Date(c(
"2007-06-22", "2007-06-22", "2007-07-13", "2007-07-13",
"2019-10-05", "2019-10-05", "2019-11-07", "2019-11-07",
"2007-06-22", "2007-06-22"
))
value <- c(0, 3, 2, 4, 0, 1, 4, 2, 6, 8)
mydata_1 <- data.frame(id, date, value)
mydata_1
mydata_1 %>%
group_by(id, date) %>%
mutate(visit = row_number()) %>%
complete(id, date, fill = list(value = 0)) %>%
pivot_wider(names_from = visit, values_from = value, names_prefix = "value.")
Created on 2021-11-25 by the reprex package (v2.0.1)

Another possible solution:
library(tidyverse)
id <- c(1,1,1,1,2,2,2,2,2,2)
date <-as.Date(c("2007-06-22", "2007-06-22", "2007-07-13","2007-07-13",
"2019-10-05", "2019-10-05", "2019-11-07", "2019-11-07",
"2007-06-22","2007-06-22"))
value <-c(0,3,2,4,0,1,4,2,6,8)
mydata_1 <- data.frame(id, date, value)
mydata_1 %>%
group_by(id, date) %>%
summarise(value = str_c(value, collapse = ","), .groups = "drop") %>%
separate(value, into=c("value1", "value2"), sep=",", convert = T)
#> # A tibble: 5 × 4
#> id date value1 value2
#> <dbl> <date> <int> <int>
#> 1 1 2007-06-22 0 3
#> 2 1 2007-07-13 2 4
#> 3 2 2007-06-22 6 8
#> 4 2 2019-10-05 0 1
#> 5 2 2019-11-07 4 2

Related

summarise by group returns 0 instead of NA if all values are NA

library(dplyr)
dat <-
data.frame(id = rep(c(1,2,3,4), each = 3),
value = c(NA, NA, NA, 0, 1, 2, 0, 1, NA, 1, 2,3))
dat %>%
dplyr::group_by(id) %>%
dplyr::summarise(value_sum = sum(value, na.rm = T))
# A tibble: 4 x 2
id value_sum
1 0
2 3
3 1
4 6
Is there any way I can return NA if all the entries in a group are NA. For e.g. id 1 has all the entries as NA so I want the value_sum to be NA as well.
# A tibble: 4 x 2
id value_sum
1 NA
2 3
3 1
4 6
One way is to use an if/else statement: If all is Na return NA else return sum():
dat %>%
dplyr::group_by(id) %>%
#dplyr::summarise(value_sum = sum(value, na.rm = F)) %>%
summarise(number = if(all(is.na(value))) NA_real_ else sum(value, na.rm = TRUE))
id number
<dbl> <dbl>
1 1 NA
2 2 3
3 3 1
4 4 6
We could use fsum
library(collapse)
fsum(dat$value, g = dat$id)
1 2 3 4
NA 3 1 6
Or with dplyr
library(dplyr)
dat %>%
group_by(id) %>%
summarise(number = fsum(value))
# A tibble: 4 × 2
id number
<dbl> <dbl>
1 1 NA
2 2 3
3 3 1
4 4 6

I want to dissociate lists in a data frame into single values to different columns in R

I have a database that goes like that:
d <- c(01, 02, 03, 04)
h <- c("19:00", "19:00", "07:00", "07:00")
p1 <- c(123, 321, 123, 123)
p2 <- c(321, 345, 567, 567)
df <- data.frame(date = d, hours = h, person1 = p1, person2 = p2)
I used this code to associate all the characteristics of each person1 in different columns:
EDITED: rn = rowid(person1, date, hours) is the actual code. Not rn = rowid(person1)
library(dplyr)
library(data.table)
library(tidyr)
df1 <- df %>%
mutate(rn = rowid(person1, date, hours)) %>%
pivot_wider(names_from = rn, values_from = c(date, hours, person2),
names_sep="")
But this code gives me this output:
# person1 date1 hours1 person21
# 123 c(1,3,4) c("19:00", "07:00", "07:00") c(321,567,567)
# 321 2 19:00 345
I Dont want it to repeat values like 07:00 or 567. I want it to give me each different value in different columns, ignoring repeated values. And if possible, organized like that:
# person1 date1 date2 date3 date4... hours1 hours2 ... person21 person22 person23 person24
# 123 01 NA 03 04 07:00 19:00 NA 321 NA 567
# 321 NA 02 NA NA NA 19:00 NA NA 345 NA
person21, 22, 23 and 24 being the first, second, third, fourth, and so on person of my df1$person1.
But the ideal output for me would be something like this:
# person1 d01 d02 d03 d04 ... h07:00 h19:00 ... p123 p321 p345 p567
# 123 1 0 1 1 ... 1 0 ... 1 0 0 1
# 321 0 1 0 0 ... 0 0 ... 1 0 0 1
How can I do this?
If we want to return a binary output, specify the values_fn and values_fill in pivot_wider
library(dplyr)
library(tidyr)
library(data.table)
df %>%
mutate(rn = rowid(person1)) %>%
pivot_wider(names_from = rn, values_from = c(date, hours, person2),
names_sep="", values_fn = length, values_fill = list(date = 0, hours = 0, person2 = 0))
# A tibble: 2 x 10
# person1 date1 date2 date3 hours1 hours2 hours3 person21 person22 person23
# <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#1 123 1 1 1 1 1 1 1 1 1
#2 321 1 0 0 1 0 0 1 0 0
If we want the values to be also column names, an option is to reshape into 'long' format first and then do the pivot_wider after transformation
df %>%
mutate(date = sprintf("%02d", date)) %>%
mutate(across(where(is.numeric), as.character)) %>%
pivot_longer(cols = -person1) %>%
mutate(name = substr(name, 1, 1)) %>%
unite(name, name, value, sep="") %>%
distinct(person1, name) %>%
mutate(n = 1) %>%
pivot_wider(names_from = name, values_from =n, values_fill = list(n = 0))
# A tibble: 2 x 10
# person1 d01 `h19:00` p321 d02 p345 d03 `h07:00` p567 d04
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 123 1 1 1 0 0 1 1 1 1
#2 321 0 1 0 1 1 0 0 0 0

Transpose and sum distinct values in R

IS there a way to transpose and summing distinct values in R For example
df
Cola Order Quantity Loc
ABC 1 4 LocA
ABC 1 4 LocB
CSD 4 6 LocA
CDS 3 2 LocB
We have same values for Order and Quantity but still need to take sum of it.
Expected Output (Transpose with respect to Quantity)
Cola Order Quantity LocA_Quantity Loc B_Quantity
ABC 2 8 4 4
CSD 4 6 6
CDS 3 2 2
Create the dataset:
library(tibble)
df = tribble(
~Cola, ~Order, ~Quantity, ~Loc,
'ABC', 1, 4, 'LocA',
'ABC', 1, 4, 'LocB',
'CSD', 4, 6, 'LocA',
'CDS', 3, 2, 'LocB'
)
Create the summaries:
library(dplyr)
df %>%
group_by(Cola) %>%
summarise(
Order = sum(Order),
LocA_Quantity = sum(Quantity * if_else(Loc == "LocA", 1, 0)),
LocB_Quantity = sum(Quantity * if_else(Loc == "LocB", 1, 0)),
Quantity = sum(Quantity)
)
You can do it for both Quantity and order and drop columns you dont want at the end, i.e.
library(tidyverse)
df %>%
group_by(Cola) %>%
mutate_at(vars(2:3), list(new = sum)) %>%
pivot_wider(names_from = Loc, values_from = 2:3)
## A tibble: 3 x 7
## Groups: Cola [3]
# Cola Order_new Quantity_new Order_LocA Order_LocB Quantity_LocA Quantity_LocB
# <fct> <int> <int> <int> <int> <int> <int>
#1 ABC 2 8 1 1 4 4
#2 CSD 4 6 4 NA 6 NA
#3 CDS 3 2 NA 3 NA 2
1) dplyr/tidyr Using the data shown reproducibly in the Note at the end, sum the orders and quantity and create a Quantity_ column equal to Quantity by Cola. Then reshape the Quantity_ column to wide form.
library(dplyr)
library(tidyr)
df %>%
group_by(Cola) %>%
mutate(Quantity_ = Quantity,
Order = sum(Order),
Quantity = sum(Quantity)) %>%
ungroup %>%
pivot_wider(names_from = "Loc", values_from = "Quantity_",
names_prefix = "Quantity_", values_fill = list(Quantity_ = 0))
giving:
# A tibble: 3 x 5
Cola Order Quantity Quantity_LocA Quantity_LocB
<chr> <int> <int> <int> <int>
1 ABC 2 8 4 4
2 CSD 4 6 6 0
3 CDS 3 2 0 2
2) Base R We can do much the same in base R using transform/ave and reshape like this:
df2 <- transform(df,
Quantity_ = Quantity,
Quantity = ave(Quantity, Cola, FUN = sum),
Order = ave(Order, Cola, FUN = sum))
wide <- reshape(df2, dir = "wide", idvar = c("Cola", "Quantity", "Order"),
timevar = "Loc", sep = "")
wide
## Cola Order Quantity Quantity_LocA Quantity_LocB
## 1 ABC 2 8 4 4
## 3 CSD 4 6 6 NA
## 4 CDS 3 2 NA 2
Note
Lines <- "Cola Order Quantity Loc
ABC 1 4 LocA
ABC 1 4 LocB
CSD 4 6 LocA
CDS 3 2 LocB"
df <- read.table(text = Lines, header = TRUE, as.is = TRUE)

Performing in group operations in R

I have a data in which I have 2 fields in a table sf -> Customer id and Buy_date. Buy_date is unique but for each customer, but there can be more than 3 different values of Buy_dates for each customer. I want to calculate difference in consecutive Buy_date for each Customer and its mean value. How can I do this.
Example
Customer Buy_date
1 2018/03/01
1 2018/03/19
1 2018/04/3
1 2018/05/10
2 2018/01/02
2 2018/02/10
2 2018/04/13
I want the results for each customer in the format
Customer mean
Here's a dplyr solution.
Your data:
df <- data.frame(Customer = c(1,1,1,1,2,2,2), Buy_date = c("2018/03/01", "2018/03/19", "2018/04/3", "2018/05/10", "2018/01/02", "2018/02/10", "2018/04/13"))
Grouping, mean Buy_date calculation and summarising:
library(dplyr)
df %>% group_by(Customer) %>% mutate(mean = mean(as.POSIXct(Buy_date))) %>% group_by(Customer, mean) %>% summarise()
Output:
# A tibble: 2 x 2
# Groups: Customer [?]
Customer mean
<dbl> <dttm>
1 1 2018-03-31 06:30:00
2 2 2018-02-17 15:40:00
Or as #r2evans points out in his comment for the consecutive days between Buy_dates:
df %>% group_by(Customer) %>% mutate(mean = mean(diff(as.POSIXct(Buy_date)))) %>% group_by(Customer, mean) %>% summarise()
Output:
# A tibble: 2 x 2
# Groups: Customer [?]
Customer mean
<dbl> <time>
1 1 23.3194444444444
2 2 50.4791666666667
I am not exactly sure of the desired output but this what I think you want.
library(dplyr)
library(zoo)
dat <- read.table(text =
"Customer Buy_date
1 2018/03/01
1 2018/03/19
1 2018/04/3
1 2018/05/10
2 2018/01/02
2 2018/02/10
2 2018/04/13", header = T, stringsAsFactors = F)
dat$Buy_date <- as.Date(dat$Buy_date)
dat %>% group_by(Customer) %>% mutate(diff_between = as.vector(diff(zoo(Buy_date), na.pad=TRUE)),
mean_days = mean(diff_between, na.rm = TRUE))
This produces:
Customer Buy_date diff_between mean_days
<int> <date> <dbl> <dbl>
1 1 2018-03-01 NA 23.3
2 1 2018-03-19 18 23.3
3 1 2018-04-03 15 23.3
4 1 2018-05-10 37 23.3
5 2 2018-01-02 NA 50.5
6 2 2018-02-10 39 50.5
7 2 2018-04-13 62 50.5
EDITED BASED ON USER COMMENTS:
Because you said that you have factors and not characters just convert them by doing the following:
dat$Buy_date <- as.Date(as.character(dat$Buy_date))
dat$Customer <- as.character(dat$Customer)

calculate column percentage difference

My df have more than 2 columns (for example, 4). I want to calculate the percentage difference of the column 2-4 compared to column 1.
time1 time2 time3 time4
1 5 1 10
2 2 2 4
3 6 3 12
My code is:
timepoint <- colnames(df)[2:4]
for (x in timepoint){
df$x <- 100*(df$x/df$time1-1)
}
What's wrong with this for function? Thank you!
Maybe this is a too long answer, but I think it's a start step
library(dplyr)
library(tibble)
library(tidyr)
df <- tribble(
~time1, ~time2, ~time3, ~time4,
1, 5, 1, 10,
2, 2, 2, 4,
3, 6, 3, 12
)
df_tidy <- df %>%
mutate(id = 1:nrow(.)) %>%
gather(time, value, time1:time4) %>%
mutate(id_base = ifelse(time == "time1", TRUE, FALSE))
df_calc <- filter(df_tidy, id_base == FALSE)
df_base <- df_tidy %>%
filter(id_base== TRUE) %>%
select(id, value_base = value)
df_join <- df_calc %>%
left_join(
df_base,
by = "id"
)
df_join %>%
mutate(diff = (value / value_base) * 100)
# A tibble: 9 × 4
id time value diff
<int> <chr> <dbl> <dbl>
1 1 time2 5 500
2 2 time2 2 100
3 3 time2 6 200
4 1 time3 1 100
5 2 time3 2 100
6 3 time3 3 100
7 1 time4 10 1000
8 2 time4 4 200
9 3 time4 12 400

Resources