Reshape () and modify_shape() - r

df <- data.frame(
code1 = c ("ZAZ","ZAZ","ZAZ","ZAZ","ZAZ","ZAZ","JOZ","JOZ","JOZ","JOZ","JOZ","JOZ","TSV","TSV"),
code2 = c("NAN","NAN","NAN","NAN","NAN","NAN","NAN","NAN","NAN","NAN","NAN","NAN","TSA","TSA"),
start = c("Date1.1","Date1.1","Date1.3","Date1.3","Date1.5","Date1.5","Date3.1","Date3.1","Date3.3","Date3.3","Date3.5","Date3.5","Date 5.1","Date 5.1"),
end = c("Date2.1","Date2.1","Date2.3","Date2.3","Date2.5","Date2.5","Date4.1","Date4.1","Date4.3","Date4.3","Date4.5","Date4.5","Date6.1","Date6.1"),
price = c(1,2,3,4,5,6,1,2,3,4,5,6,1,2))
I'm trying to achieve:
I have so far done:
df <- df %>%
group_by(code1, code2,start,end) %>%
slice_min(price) #%>%
group_modify()
df <- df[order(df$price),]
All well explained in the image but in brief:
To group by code1,code2,start,end and select smallest price for each
Reshape sending start,end,price to different columns (max 3 start,end,price per key code1,code2
I understand that this can be done within group_modify() but unsure how
Any help so much appreciated!
Brian

Here is one way using dplyr and tidyr libraries.
For each group (code1, code2, start and end) calculate the minimum value of price.
Create an index column for code1 and code2. This is to name start, end and price as start_1, start_2 etc.
Get the data in wide format using pivot_wider.
library(dplyr)
library(tidyr)
df %>%
group_by(code1, code2, start, end) %>%
summarise(price = min(price, na.rm = TRUE)) %>%
group_by(code1, code2) %>%
mutate(index = row_number()) %>%
ungroup() %>%
pivot_wider(names_from = index, values_from = c(start, end, price),
names_vary = "slowest")
# code1 code2 start_1 end_1 price_1 start_2 end_2 price_2 start_3 end_3 price_3
# <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl>
#1 JOZ NAN Date3.1 Date4.1 1 Date3.3 Date4.3 3 Date3.5 Date4.5 5
#2 TSV TSA Date 5.1 Date6.1 1 NA NA NA NA NA NA
#3 ZAZ NAN Date1.1 Date2.1 1 Date1.3 Date2.3 3 Date1.5 Date2.5 5
Note that names_vary = "slowest" allows to have columns in an orderly fashion (start_1, end_1, price_1... instead of start_1, start_2 ..., end_1, end_2... etc. )

I guess you can try aggregate + reshape + ave (all from base R)
reshape(
transform(
aggregate(price ~ ., df, min),
id = ave(seq_along(price), code1, code2, FUN = seq_along)
),
direction = "wide",
idvar = c("code1", "code2"),
timevar = "id"
)
which gives
code1 code2 start.1 end.1 price.1 start.2 end.2 price.2 start.3 end.3
1 ZAZ NAN Date1.1 Date2.1 1 Date1.3 Date2.3 3 Date1.5 Date2.5
4 JOZ NAN Date3.1 Date4.1 1 Date3.3 Date4.3 3 Date3.5 Date4.5
7 TSV TSA Date5.1 Date6.1 1 <NA> <NA> NA <NA> <NA>
price.3
1 5
4 5
7 NA

Related

Summarize one variable/column over all possible values of other variables/columns

I need to summarize one variable/column of a long table after aggregating (group_by()) by another variable/column, I need to have the summarized value by all values of other variables/columns.
Here is test data:
library(tidyverse)
set.seed(123)
Site <- str_c("S", 1:5)
Species <- str_c("Sps", 1:6)
print(Species_tbl <- bind_cols(Species = Species,
Exotic = rbinom(length(Species), 1, .3),
Migrant = rbinom(length(Species), 2, .3)))
Data_tbl <- expand.grid(Site = Site,
Species = Species) %>%
left_join(Species_tbl)
Data_tbl$Presence <- rbinom(nrow(Data_tbl), 1, .5)
And here is my best effort:
print(Data_tbl %>%
group_by(Site) %>%
summarise(N_sp = sum(Presence),
N_sp_Exo = sum(Presence[Exotic == 1]),
N_sp_Nat = sum(Presence[Exotic == 0]),
N_sp_M0 = sum(Presence[Migrant == 0]),
N_sp_M1 = sum(Presence[Migrant == 1]),
N_sp_M2 = sum(Presence[Migrant == 2])))
You can get the data in long format for your columns of interest c(Exotic, Migrant) and take sum of Presence columns for each unique column names and it's values. This can be merged with sum of each Site.
library(dplyr)
library(tidyr)
data1 <- Data_tbl %>%
group_by(Site) %>%
summarise(N_sp = sum(Presence))
data2 <- Data_tbl %>%
pivot_longer(cols = c(Exotic, Migrant)) %>%
group_by(Site, name, value) %>%
summarise(result = sum(Presence), .groups = "drop") %>%
pivot_wider(names_from = c(name, value), values_from = result)
inner_join(data1, data2, by = 'Site')
# Site N_sp Exotic_0 Exotic_1 Migrant_0 Migrant_1 Migrant_2
# <fct> <int> <int> <int> <int> <int> <int>
#1 S1 4 2 2 1 2 1
#2 S2 3 2 1 0 2 1
#3 S3 2 1 1 0 2 0
#4 S4 4 2 2 1 3 0
#5 S5 4 1 3 1 2 1
The answer has been divided in two steps for ease of readability. If you would like to do this in a single chain without creating temporary variables that can be done as well.

extract valus of another dataframe if value of one column is partially match in R

Sorry I didn't clarify my question,
my aim is if dt$id %in% df$id , extract df$score add to new column at dt,
I have a dataframe like this :
df <- tibble(
score = c(2587,002,885,901,2587,3371,3372,002),
id = c("AR01.0","AR01.1","AR01.12","ERS02.00","ERS02.01","ERS02.02","QR01","QR01.03"))
And I have another dataframe like
dt <- tibble(
id = c("AR01","QR01","KVC"),
city = c("AM", "Bis","CHB"))
I want to mutate a new column "score"
I want to got output like below :
id
city
score
AR01
AM
2587/2/885
ERS02
Bis
901/3371
KVC
CHB
NA
or
id
city
score
score2
score3
AR01
AM
2587
2
885
ERS02
Bis
901
3371
NA
KVC
CHB
NA
NA
NA
I tried to use ifelse to achieve but always got error,
do any one can provide ideas? Thank you.
A simple left_join (after mutateing id values in df) is required:
library(dplyr)
library(stringr)
left_join(df %>% mutate(id = str_extract(id, "[\\w]+")), dt, by = "id") %>%
group_by(id) %>%
summarise(across(city,first),
score = paste(score, collapse = "/"))
# A tibble: 3 × 3
id city score
<chr> <chr> <chr>
1 AR01 AM 2587/2/885
2 ERS02 NA 901/2587/3371
3 QR01 Bis 3372/2
For the second solution you can use separate:
library(dyplr)
library(stringr)
library(tidyr)
left_join(df %>% mutate(id = str_extract(id, "[\\w]+")), dt, by = "id") %>%
group_by(id) %>%
summarise(across(city,first),
score = paste(score, collapse = "/")) %>%
separate(score,
into = paste("score", 1:3),
sep = "/" )
# A tibble: 3 × 5
id city `score 1` `score 2` `score 3`
<chr> <chr> <chr> <chr> <chr>
1 AR01 AM 2587 2 885
2 ERS02 NA 901 2587 3371
3 QR01 Bis 3372 2 NA
You could create groups by extracting everything before the . using sub to group_by on and merge the rows with paste separated with / and right_join them by id like this:
library(tibble)
df <- tibble(
score = c(2587,002,885,901,2587,3371,3372,002),
id = c("AR01.0","AR01.1","AR01.12","ERS02.00","ERS02.01","ERS02.02","QR01","QR01.03"))
dt <- tibble(
id = c("AR01","QR01","KVC"),
city = c("AM", "Bis","CHB"))
library(dplyr)
df %>%
mutate(id = sub('\\..*', "", id)) %>%
group_by(id) %>%
mutate(score = paste(score, collapse = '/')) %>%
distinct(id, .keep_all = TRUE) %>%
ungroup() %>%
right_join(., dt, by = 'id')
#> # A tibble: 3 × 3
#> score id city
#> <chr> <chr> <chr>
#> 1 2587/2/885 AR01 AM
#> 2 3372/2 QR01 Bis
#> 3 <NA> KVC CHB
Created on 2022-10-01 with reprex v2.0.2

how to replace NA with the value that later input with same ID and date

I have a data that looks and I want to filled n/a with the result that is later input with same ID and test_date, and only keep one record for each ID each day.
What should I do?
Here is the codes for sample data:
ID <-c("1", "1", "1","2", "2")
Test_date <-c("2020-07-09", "2020-07-09","2020-07-09", "2020-07-07","2020-07-08")
Art <-c("N/A","D","N/A","N/A", "B")
PE<-c("N/A","N/A","B","A","N/A")
Sample.data <- data.frame(ID, Test_date, Art, PE)
In Base-R
First change character strings "N/A" to actual NA
Sample.data[Sample.data=="N/A"] <- NA
now the the real meat of the answer
merge(
aggregate(Art ~ ID + Test_date, Sample.data, paste),
aggregate(PE ~ ID + Test_date, Sample.data, paste),
all=T
)
output:
ID Test_date Art PE
1 1 2020-07-09 D B
2 2 2020-07-07 <NA> A
3 2 2020-07-08 B <NA>
Using data.table:
library(data.table)
# Convert to data.table
setDT(Sample.data)
# Format NA properly as NA
Sample.data[, c("Art", "PE") := lapply(.SD, function(x) fifelse(x == "N/A", NA_character_, x)), .SDcols = c("Art", "PE")]
Sample.data[, .(Art[!is.na(Art)], PE[!is.na(PE)]), by = .(ID, Test_date)]
# ID Test_date V1 V2
# 1: 1 2020-07-09 D B
# 2: 2 2020-07-07 <NA> A
# 3: 2 2020-07-08 B <NA>
Alternatively:
Sample.data[, lapply(.SD, function(x) x[!is.na(x)]), by = .(ID, Test_date)]
(Edited to correct my misgrouping.)
I'm going to suggest a tidyverse solution to be expeditious, though this can be done (with a little more effort) in base R (and data.table).
A few tasks:
replace "N/A" (which is a completely valid and definite string) with NA (actually, NA_character_, since there are over six types of NA in R);
convert Test_date to a real Date class, and order by this;
fill up by group;
group by id/date and keep only one
The first few are done with
library(dplyr)
library(tidyr) # fill
Sample.data %>%
mutate(Test_date = as.Date(Test_date)) %>%
mutate_at(vars(Art, PE), ~ replace(., . == "N/A", NA_character_)) %>%
arrange(Test_date) %>%
group_by(ID, Test_date) %>%
tidyr::fill(., Art, PE, .direction = "up") %>%
ungroup()
# # A tibble: 5 x 4
# ID Test_date Art PE
# <chr> <date> <chr> <chr>
# 1 2 2020-07-07 <NA> A
# 2 2 2020-07-08 B <NA>
# 3 1 2020-07-09 D B
# 4 1 2020-07-09 D B
# 5 1 2020-07-09 <NA> B
though you need to think about what happens when your last observation is NA.
Now for your last point
and only keep one record for each ID each day
I'll expand the above with a little more. I'm going to infer first, but frankly you haven't provided enough information to know if it should be first, last, sum, max, row-with-the-fewest-NA-values, or whatever.
Sample.data %>%
mutate(Test_date = as.Date(Test_date)) %>%
mutate_at(vars(Art, PE), ~ replace(., . == "N/A", NA_character_)) %>%
arrange(Test_date) %>%
group_by(ID, Test_date) %>%
tidyr::fill(., Art, PE, .direction = "up") %>%
slice(1) %>%
ungroup()
# # A tibble: 3 x 4
# ID Test_date Art PE
# <chr> <date> <chr> <chr>
# 1 1 2020-07-09 D B
# 2 2 2020-07-07 <NA> A
# 3 2 2020-07-08 B <NA>

R transpose including NA

I have data like,
trackingnumer = c(1,1,2,2,3)
date = c("2017-08-01", "2017-08-10", "2017-08-02", "2017-08-05", "2017-08-12")
scan = c("Pickup", "Delivered", "Pickup", "Delivered", "Delivered")
df = data.frame(trackingnumer, date, scan)
I want to transpose this data by trackignumber
df2 <- df %>%
group_by(trackingnumer) %>%
mutate(n = row_number()) %>%
{data.table::dcast(data = setDT(.), trackingnumer ~ n, value.var = c('date', 'scan'))}
I have tried this one, but I couldn't get the desirable outcome.I want to set data_1 as pickup date, and date_2 as delivered date. As you can see, trackingnumber 3 doesn't have pickup record so I want date_1 to be NA.
Base R attempt, using relevel to set the appropriate ordering of the scan column:
reshape(
cbind(df, time=as.numeric(relevel(df$scan, "Pickup"))),
idvar="trackingnumer", direction="wide", sep="_"
)
# trackingnumer date_1 scan_1 date_2 scan_2
#1 1 2017-08-01 Pickup 2017-08-10 Delivered
#3 2 2017-08-02 Pickup 2017-08-05 Delivered
#5 3 <NA> <NA> 2017-08-12 Delivered
The problem was that your function in mutate was just counting the rows, it wasn’t paying attention to what was in them. The case_when() function lets you specify specific values for the “n” column based on the value of “scan”
df2 <- df %>%
group_by(trackingnumer) %>%
mutate(n = case_when(scan == "Pickup" ~ 1,
scan == "Delivered" ~ 2)) %>%
{data.table::dcast(data = setDT(.), trackingnumer ~ n, value.var = c('date', 'scan'))}
Or with tidyr
library(tidyr)
df %>% group_by(trackingnumer,scan2 = scan) %>%
nest(date,scan) %>%
spread(scan2,data) %>%
mutate_at(c("Delivered","Pickup"),~ifelse(map_lgl(.x,is_tibble),.x,lst(tibble(date=NA,scan=NA)))) %>%
unnest %>%
rename_at(c("date","scan"),paste0,2)
# # A tibble: 3 x 5
# trackingnumer date2 scan2 date1 scan1
# <dbl> <fctr> <fctr> <fctr> <fctr>
# 1 1 2017-08-10 Delivered 2017-08-01 Pickup
# 2 2 2017-08-05 Delivered 2017-08-02 Pickup
# 3 3 2017-08-12 Delivered <NA> <NA>

Conditional sum with dates in column names

Want to calculate conditional sum based on specified dates in r. My sample df is
start_date = c("7/24/2017", "7/1/2017", "7/25/2017")
end_date = c("7/27/2017", "7/4/2017", "7/28/2017")
`7/23/2017` = c(1,5,1)
`7/24/2017` = c(2,0,2)
`7/25/2017` = c(0,0,10)
`7/26/2017` = c(2,2,2)
`7/27/2017` = c(0,0,0)
df = data.frame(start_date,end_date,`7/23/2017`,`7/24/2017`,`7/25/2017`,`7/26/2017`,`7/27/2017`)
In Excel it looks like:
I want to perform calculations as specified in Column H which is a conditional sum of columns C through G based on the dates specified in columns A and B.
Apparently, Excel allows columns to be dates but not R.
#wide to long format
dat <- reshape(df, direction="long", varying=list(names(df)[3:7]), v.names="Value",
idvar=c("start_date","end_date"), timevar="Date",
times=seq(as.Date("2017/07/23"),as.Date("2017/07/27"), "day"))
#convert from factor to date class
dat$end_date <- as.Date(dat$end_date, format = "%m/%d/%Y")
dat$start_date <- as.Date(dat$start_date, format = "%m/%d/%Y")
library(dplyr)
dat %>% group_by(start_date, end_date) %>%
mutate(mval = ifelse(between(Date, start_date, end_date), Value, 0)) %>%
summarise(conditional_sum=sum(mval))
# # A tibble: 3 x 3
# # Groups: start_date [?]
# start_date end_date conditional_sum
# <date> <date> <dbl>
# 1 2017-07-01 2017-07-04 0
# 2 2017-07-24 2017-07-27 4
# 3 2017-07-25 2017-07-28 12
You could achieve that as follows:
# number of trailing columns without numeric values
c = 2
# create a separate vector with the dates
dates = as.Date(gsub("X","",tail(colnames(df),-c)),format="%m.%d.%Y")
# convert date columns in dataframe
df$start_date = as.Date(df$start_date,format="%m/%d/%Y")
df$end_date = as.Date(df$end_date,format="%m/%d/%Y")
# calculate sum
sapply(1:nrow(df),function(x) {y = df[x,(c+1):ncol(df)][dates %in%
seq(df$start_date[x],df$end_date[x],by="day") ]; ifelse(length(y)>0,sum(y),0) })
returns:
[1] 4 0 12
Hope this helps!
Here's a solution all in one dplyr pipe:
library(dplyr)
library(lubridate)
library(tidyr)
df %>%
gather(date, value, -c(1, 2)) %>%
mutate(date = gsub('X', '', date)) %>%
mutate(date = gsub('\\.', '/', date)) %>%
mutate(date = mdy(date)) %>%
filter(date >= mdy(start_date) & date <=mdy(end_date)) %>%
group_by(start_date, end_date) %>%
summarize(Conditional_Sum = sum(value)) %>%
right_join(df) %>%
mutate(Conditional_Sum = ifelse(is.na(Conditional_Sum), 0, Conditional_Sum)) %>%
select(-one_of('Conditional_Sum'), one_of('Conditional_Sum'))
## start_date end_date X7.23.2017 X7.24.2017 X7.25.2017 X7.26.2017 X7.27.2017 Conditional_Sum
## <fctr> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7/24/2017 7/27/2017 1 2 0 2 0 4
## 2 7/1/2017 7/4/2017 5 0 0 2 0 0
## 3 7/25/2017 7/28/2017 1 2 10 2 0 12

Resources