I have a vector of dates with a certain amount of visits per date and the specific type of visit.
Data (the 'ficol' column can be ignored):
structure(c("2021-06-15", "15", "60", "T0s", "2021-06-16", "15",
"60", "T0s", "2021-06-17", " 8", "32", "T0s", "2021-06-21", "15",
"60", "T0s", "2021-06-22", "15", "60", "T0s", "2021-06-23", "15",
"60", "T0s", "2021-06-24", "15", "60", "T0s", "2021-06-28", "15",
"60", "T0s", "2021-06-29", "15", "60", "T0s", "2021-06-30", "15",
"60", "T0s", "2021-07-01", "15", "60", "T0s", "2021-07-05", "15",
"60", "T0s", "2021-07-06", "15", "60", "T0s", "2021-07-07", "15",
"60", "T0s", "2021-07-08", "15", "60", "T0s", "2021-07-12", "15",
"60", "T0s", "2021-07-13", "15", "60", "T0s", "2021-07-14", "15",
"60", "T0s", "2021-07-15", "15", "60", "T0s", "2021-07-19", "15",
"60", "T0s", "2021-07-20", "15", "60", "T0s", "2021-07-21", "15",
"60", "T0s", "2021-07-22", "15", "60", "T0s", "2021-07-26", "15",
"60", "T0s", "2021-07-27", "15", "60", "T0s", "2021-07-28", "15",
"60", "T0s", "2021-07-29", "15", "60", "T0s", "2021-06-30", "30",
"60", "T1s", "2021-07-01", " 8", "16", "T1s", "2021-07-05", "26",
"52", "T1s", "2021-07-06", "30", "60", "T1s", "2021-07-07", "30",
"60", "T1s", "2021-07-08", " 4", " 8", "T1s", "2021-07-12", " 4",
" 8", "T1s", "2021-07-13", "29", "58", "T1s", "2021-07-14", "27",
"54", "T1s", "2021-07-20", "30", "60", "T1s", "2021-07-21", "30",
"60", "T1s", "2021-07-26", "30", "60", "T1s", "2021-07-27", "30",
"60", "T1s", "2021-07-28", "30", "60", "T1s", "2021-08-02", "30",
"60", "T1s", "2021-08-03", " 8", "16", "T1s", "2021-08-23", "12",
"60", "T3s", "2021-08-24", "12", "60", "T3s", "2021-08-25", "12",
"60", "T3s", "2021-08-26", " 2", "10", "T3s", "2021-08-30", "12",
"60", "T3s", "2021-08-31", "12", "60", "T3s", "2021-09-01", "12",
"60", "T3s", "2021-09-06", "12", "60", "T3s", "2021-09-07", "12",
"60", "T3s", "2021-09-08", "12", "60", "T3s", "2021-09-13", "12",
"60", "T3s", "2021-09-14", "12", "60", "T3s", "2021-09-15", "12",
"60", "T3s", "2021-09-16", "12", "60", "T3s", "2021-09-20", "12",
"60", "T3s", "2021-09-21", "12", "60", "T3s", "2021-09-22", "12",
"60", "T3s", "2021-09-23", "12", "60", "T3s", "2021-09-27", "12",
"60", "T3s", "2022-01-10", "15", "60", "T5s", "2022-01-11", "15",
"60", "T5s", "2022-01-12", " 8", "32", "T5s", "2022-01-17", "15",
"60", "T5s", "2022-01-18", "15", "60", "T5s", "2022-01-19", " 6",
"24", "T5s", "2022-01-24", "15", "60", "T5s", "2022-01-25", "15",
"60", "T5s", "2022-01-26", " 6", "24", "T5s", "2022-01-31", "15",
"60", "T5s", "2022-02-01", "15", "60", "T5s", "2022-02-02", " 6",
"24", "T5s", "2022-02-03", "12", "48", "T5s", "2022-02-07", "15",
"60", "T5s", "2022-02-08", "15", "60", "T5s", "2022-02-09", " 6",
"24", "T5s", "2022-02-10", "15", "60", "T5s", "2022-02-14", " 9",
"36", "T5s"), .Dim = c(4L, 80L), .Dimnames = list(c("Var1", "Freq",
"ficol", "visit"), NULL))
What I would love is to have a calendar with the dates (Var1 in data) to be marked with a color per visit type. For example, visit T0s dates are green, T1s dates purple, T3s dates blue, etc..
I found the package calendR, but I can't figure it out for a whole year, only per month, using an example found here: https://r-coder.com/calendar-plot-r/
Can anyone help me out? Greatly appreciated!
library(tidyverse)
library(lubridate)
library(assertr)
library(calendR)
df <- structure(c("2021-06-15", "15", "60", "T0s", "2021-06-16", "15",
"60", "T0s", "2021-06-17", " 8", "32", "T0s", "2021-06-21", "15",
"60", "T0s", "2021-06-22", "15", "60", "T0s", "2021-06-23", "15",
"60", "T0s", "2021-06-24", "15", "60", "T0s", "2021-06-28", "15",
"60", "T0s", "2021-06-29", "15", "60", "T0s", "2021-06-30", "15",
"60", "T0s", "2021-07-01", "15", "60", "T0s", "2021-07-05", "15",
"60", "T0s", "2021-07-06", "15", "60", "T0s", "2021-07-07", "15",
"60", "T0s", "2021-07-08", "15", "60", "T0s", "2021-07-12", "15",
"60", "T0s", "2021-07-13", "15", "60", "T0s", "2021-07-14", "15",
"60", "T0s", "2021-07-15", "15", "60", "T0s", "2021-07-19", "15",
"60", "T0s", "2021-07-20", "15", "60", "T0s", "2021-07-21", "15",
"60", "T0s", "2021-07-22", "15", "60", "T0s", "2021-07-26", "15",
"60", "T0s", "2021-07-27", "15", "60", "T0s", "2021-07-28", "15",
"60", "T0s", "2021-07-29", "15", "60", "T0s", "2021-06-30", "30",
"60", "T1s", "2021-07-01", " 8", "16", "T1s", "2021-07-05", "26",
"52", "T1s", "2021-07-06", "30", "60", "T1s", "2021-07-07", "30",
"60", "T1s", "2021-07-08", " 4", " 8", "T1s", "2021-07-12", " 4",
" 8", "T1s", "2021-07-13", "29", "58", "T1s", "2021-07-14", "27",
"54", "T1s", "2021-07-20", "30", "60", "T1s", "2021-07-21", "30",
"60", "T1s", "2021-07-26", "30", "60", "T1s", "2021-07-27", "30",
"60", "T1s", "2021-07-28", "30", "60", "T1s", "2021-08-02", "30",
"60", "T1s", "2021-08-03", " 8", "16", "T1s", "2021-08-23", "12",
"60", "T3s", "2021-08-24", "12", "60", "T3s", "2021-08-25", "12",
"60", "T3s", "2021-08-26", " 2", "10", "T3s", "2021-08-30", "12",
"60", "T3s", "2021-08-31", "12", "60", "T3s", "2021-09-01", "12",
"60", "T3s", "2021-09-06", "12", "60", "T3s", "2021-09-07", "12",
"60", "T3s", "2021-09-08", "12", "60", "T3s", "2021-09-13", "12",
"60", "T3s", "2021-09-14", "12", "60", "T3s", "2021-09-15", "12",
"60", "T3s", "2021-09-16", "12", "60", "T3s", "2021-09-20", "12",
"60", "T3s", "2021-09-21", "12", "60", "T3s", "2021-09-22", "12",
"60", "T3s", "2021-09-23", "12", "60", "T3s", "2021-09-27", "12",
"60", "T3s", "2022-01-10", "15", "60", "T5s", "2022-01-11", "15",
"60", "T5s", "2022-01-12", " 8", "32", "T5s", "2022-01-17", "15",
"60", "T5s", "2022-01-18", "15", "60", "T5s", "2022-01-19", " 6",
"24", "T5s", "2022-01-24", "15", "60", "T5s", "2022-01-25", "15",
"60", "T5s", "2022-01-26", " 6", "24", "T5s", "2022-01-31", "15",
"60", "T5s", "2022-02-01", "15", "60", "T5s", "2022-02-02", " 6",
"24", "T5s", "2022-02-03", "12", "48", "T5s", "2022-02-07", "15",
"60", "T5s", "2022-02-08", "15", "60", "T5s", "2022-02-09", " 6",
"24", "T5s", "2022-02-10", "15", "60", "T5s", "2022-02-14", " 9",
"36", "T5s"), .Dim = c(4L, 80L), .Dimnames = list(c("Var1", "Freq",
"ficol", "visit"), NULL))
df1 <- df %>%
t() %>%
as_tibble() %>%
mutate(
Var1 = ymd(Var1),
Freq = as.integer(Freq),
ficol = as.integer(ficol)) %>%
rename(date = Var1) %>%
arrange(date)
df2 <- df1 %>%
filter(year(date) == 2021) # choose only 1 year
df2
#> # A tibble: 62 x 4
#> date Freq ficol visit
#> <date> <int> <int> <chr>
#> 1 2021-06-15 15 60 T0s
#> 2 2021-06-16 15 60 T0s
#> 3 2021-06-17 8 32 T0s
#> 4 2021-06-21 15 60 T0s
#> 5 2021-06-22 15 60 T0s
#> 6 2021-06-23 15 60 T0s
#> 7 2021-06-24 15 60 T0s
#> 8 2021-06-28 15 60 T0s
#> 9 2021-06-29 15 60 T0s
#> 10 2021-06-30 15 60 T0s
#> # ... with 52 more rows
# Some days carry more than one visit type; collapse them into a single
# comma-separated label per date (e.g. "T0s, T1s") so each calendar day
# maps to exactly one category for colouring.
df3 <- df2 %>%
group_by(date) %>%
summarise(visits = str_c(sort(visit), collapse = ", "),
.groups = "drop")
df3
#> # A tibble: 48 x 2
#> date visits
#> <date> <chr>
#> 1 2021-06-15 T0s
#> 2 2021-06-16 T0s
#> 3 2021-06-17 T0s
#> 4 2021-06-21 T0s
#> 5 2021-06-22 T0s
#> 6 2021-06-23 T0s
#> 7 2021-06-24 T0s
#> 8 2021-06-28 T0s
#> 9 2021-06-29 T0s
#> 10 2021-06-30 T0s, T1s
#> # ... with 38 more rows
df3 %>%
count(visits)
#> # A tibble: 4 x 2
#> visits n
#> <chr> <int>
#> 1 T0s 13
#> 2 T0s, T1s 14
#> 3 T1s 2
#> 4 T3s 19
# Assign one colour per visit combination, then expand to every day of
# 2021 so the vector lines up with calendR's one-cell-per-day layout.
df4 <- df3 %>%
mutate(color = case_when(
visits == "T0s" ~ "red",
visits == "T0s, T1s" ~ "orange",
visits == "T1s" ~ "yellow",
visits == "T3s" ~ "green"
)) %>%
# Fail loudly if a visit combination has no colour mapped above.
assertr::verify(!is.na(color)) %>%
full_join(
# One row per calendar day of 2021; days without visits stay NA.
tibble(date = seq(as.Date("2021-01-01"), as.Date("2022-01-01") - 1, by = "days")),
by = "date"
) %>%
mutate(yday = lubridate::yday(date)) %>%
arrange(date)
df4 %>%
filter(!is.na(visits))
#> # A tibble: 48 x 4
#> date visits color yday
#> <date> <chr> <chr> <dbl>
#> 1 2021-06-15 T0s red 166
#> 2 2021-06-16 T0s red 167
#> 3 2021-06-17 T0s red 168
#> 4 2021-06-21 T0s red 172
#> 5 2021-06-22 T0s red 173
#> 6 2021-06-23 T0s red 174
#> 7 2021-06-24 T0s red 175
#> 8 2021-06-28 T0s red 179
#> 9 2021-06-29 T0s red 180
#> 10 2021-06-30 T0s, T1s orange 181
#> # ... with 38 more rows
# Draw the full-year calendar; days are coloured by their `visits` label.
# NOTE(review): calendR pairs `special.col` with the sorted unique values
# of `special.days`, while unique(na.omit(...)) returns colours in
# first-appearance (date) order. Here the two orders happen to agree
# ("T0s" < "T0s, T1s" < "T1s" < "T3s"), but verify this after adding
# new visit types.
calendR::calendR(year = 2021,
start = "M", # weeks start on Monday
special.days = df4$visits,
special.col = unique(na.omit(df4$color)),
legend.pos = "right")
Related
Suppose a data:
df1 <- tibble::tribble(~"M1", ~"M2", ~"Beer, pints", ~"Coffee, oz", ~"Gasoline, galons", ~"Milk, galons", ~"Warehouse, square feet", ~"Nearest place, miles",
"NY", "22", "10", "12", "15", "100", "100", "20",
"NY", "20", "9", "10", "12", "100", "100", "20",
"NY", "18", "8", "9", "11", "100", "100", "20",
"M1", "M2", "Beer, liters", "Coffee, cups (120 ml)", "Gasoline, liters", "Milk, liters", "Warehouse, square meters", "Nearest place, kilometers",
"PR", "22", "7", "8", "9", "70", "67", "7",
"PR", "20", "6", "7", "8", "80", "75", "7",
"M1", "M2", "Beer, pints", "Coffee, oz", "Gasoline, liters", "Milk, liters", "Warehouse, square feet", "Nearest place, miles",
"KR", "22", "6", "6", "7", "60", "50", "9",
"KR", "20", "5", "6", "8", "55", "65", "9",
"KR", "18", "5", "6", "8", "50", "55", "9")
For visual representation:
Is there a nice method to recalculate all columns into the same units (e.g. if it is liters, then the entire column should be liters; if miles (not kilometers), then the entire column should be miles), based on the unit named in the subheadings inside the data?
It could be great to think on the nicest methods to solve it.
PS: for information:
1 gallon = 3.78541 liters
1 pint = 0.473176 liters
1 oz = 0.0295735 liters
11 square feet = 1.02193 square meters
1 mile = 1.60934 kilometers
I am just wondering and just started to consider for solution.
I am interested to look for possible nice solutions.
In addition, it will be interesting for the entire R community to think on the best methods to edit the data by condition.
When the data is sloppy, we must also get our hands dirty. I thought of a way, with many steps.
Data
df1 <-
structure(list(m1 = c("M1", "NY", "NY", "NY", "M1", "PR", "PR",
"M1", "KR", "KR", "KR"), m2 = c("M2", "22", "20", "18", "M2",
"22", "20", "M2", "22", "20", "18"), beer = c("Beer, pints",
"10", "9", "8", "Beer, liters", "7", "6", "Beer, pints", "6",
"5", "5"), coffee = c("Coffee, oz", "12", "10", "9", "Coffee, cups (120 ml)",
"8", "7", "Coffee, oz", "6", "6", "6"), gasoline = c("Gasoline, galons",
"15", "12", "11", "Gasoline, liters", "9", "8", "Gasoline, liters",
"7", "8", "8"), milk = c("Milk, galons", "100", "100", "100",
"Milk, liters", "70", "80", "Milk, liters", "60", "55", "50"),
warehouse = c("Warehouse, square feet", "100", "100", "100",
"Warehouse, square meters", "67", "75", "Warehouse, square feet",
"50", "65", "55"), nearest_place = c("Nearest_place, miles",
"20", "20", "20", "Nearest place, kilometers", "7", "7",
"Nearest place, miles", "9", "9", "9")), row.names = c(NA,
-11L), class = c("tbl_df", "tbl", "data.frame"))
Convert function
#' Convert a measurement to its metric target unit.
#'
#' @param value Numeric vector, or character vector coercible to numeric.
#' @param unit Character vector of unit labels (recycled against `value`).
#'   Unknown or already-metric units (e.g. "liters") are left unchanged.
#' @return Numeric vector of converted values.
convert_unit <- function(value, unit) {
  # Conversion factors to the metric unit of each quantity.
  factors <- c(
    galons     = 3.78541,      # US gallons -> liters
    pints      = 0.473176,     # pints -> liters
    oz         = 0.0295735,    # fluid ounces -> liters
    squarefeet = 1.02193 / 11, # 11 sq ft = 1.02193 sq m
    miles      = 1.60934       # miles -> km (bug fix: was 1.02193/11,
                               # a copy of the square-feet factor)
  )
  m <- unname(factors[unit])
  m[is.na(m)] <- 1 # unmatched units: keep the value as-is
  m * as.numeric(value)
}
Data preparation
First, I would add the header as the first row and also create better names.
library(dplyr)
library(stringr)
library(tidyr)
#remotes::install_github("vbfelix/relper")
library(relper)
# Keep the original headers, and derive clean column names from the text
# before the comma (e.g. "Beer, pints" -> "beer").
or_names <- names(df1)
new_names <- str_to_lower(str_select(or_names,before = ","))
# Shift every row down by one and re-insert the original header as the
# first data row, so the header and the in-data unit subheadings can be
# processed the same way.
n_row <- nrow(df1)
df1[2:(n_row+1),] <- df1
df1[1,] <- as.list(or_names)
names(df1) <- new_names
Data manipulation
Then, I would create new columns with the units, and the apply the function to each one.
df1 %>%
mutate(
# For every measurement column, pull the unit text after the comma
# (empty string on plain data rows), e.g. "pints", "liters".
across(.cols = -c(m1:m2),.fns = ~str_keep(str_select(.,after = ",")),.names = "{.col}_unit"),
# Data rows have no unit text; flag them so the subheading rows can
# be dropped once their units have been filled downwards.
aux = beer_unit == "",
across(.cols = ends_with("_unit"),~if_else(. == "",NA_character_,.))) %>%
# Propagate each subheading's units onto the data rows below it.
fill(ends_with("_unit"),.direction = "down") %>%
filter(aux) %>%
mutate(
# Convert each measurement column using its own "<col>_unit" column.
across(
.cols = beer:nearest_place,
.fns = ~convert_unit(value = .,unit = get(str_c(cur_column(),"_unit")))
)
) %>%
select(-aux,-ends_with("_unit"))
Output
# A tibble: 8 x 8
m1 m2 beer coffee gasoline milk warehouse nearest_place
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 NY 22 4.73 0.355 56.8 379. 9.29 1.86
2 NY 20 4.26 0.296 45.4 379. 9.29 1.86
3 NY 18 3.79 0.266 41.6 379. 9.29 1.86
4 PR 22 7 8 9 70 67 7
5 PR 20 6 7 8 80 75 7
6 KR 22 2.84 0.177 7 60 4.65 0.836
7 KR 20 2.37 0.177 8 55 6.04 0.836
8 KR 18 2.37 0.177 8 50 5.11 0.836
I have the following extract of my dataset about the occupancy of a football match:
# Occupancy data: one row per ticket sold.
# Note: the original used `Name <- c(...)` inside data.frame(), which
# creates stray global variables and mangled column names that then had
# to be patched with colnames(); named arguments avoid both problems.
example <- data.frame(
  Date = c("2019-03-21", "2019-03-30", "2019-04-07",
           "2019-03-21", "2019-03-30", "2019-04-07",
           "2019-03-21", "2019-04-07",
           "2019-03-21", "2019-03-30", "2019-04-07",
           "2019-03-21", "2019-03-30", "2019-04-07",
           "2019-03-21", "2019-03-30", "2019-04-07",
           "2019-03-21", "2019-03-30", "2019-04-07",
           "2019-03-21", "2019-03-30", "2019-04-07",
           "2019-03-21", "2019-03-30",
           "2019-03-21", "2019-03-30",
           "2019-03-21", "2019-03-30",
           "2019-03-21"),
  Block = c("43L", "43L", "43L", "15B", "15B", "15B", "43L", "43L",
            rep("15B", 22)),
  # Ticket price; the only numeric column, the rest stay character.
  Price = as.numeric(c("24", "35", "30", "35", "45", "40", "26", "30",
                       "35", "45", "40",
                       "34", "43", "42",
                       "35", "42", "45",
                       "36", "45", "43",
                       "36", "43", "40",
                       "35", "41",
                       "32", "42",
                       "30", "42",
                       "35")),
  Max = c("3", "3", "3", "10", "10", "10", "3", "3", rep("10", 22)),
  Actual = c("2", "1", "2", "10", "9", "6", "2", "2",
             rep(c("10", "9", "6"), 5),
             rep(c("10", "9"), 3),
             "10"),
  # Column name keeps the original's spelling ("Temprature").
  Temprature = c("15", "20", "18", "15", "20", "18", "15", "18",
                 rep(c("15", "20", "18"), 5),
                 rep(c("15", "20"), 3),
                 "15"),
  Placesold = rep("1", 30),
  stringsAsFactors = FALSE
)
In reality, the dataset contains over 100 blocks and 46 different dates.
If you take a closer look at the data, you can see that different numbers of seats are sold out in block 15B and 43L on different days.
table(example$Date, example$Block)
table(example$Placesold)
15B 43L
2019-03-21 10 2
2019-03-30 9 1
2019-04-07 6 2
> table(example$Placesold)
1
30
My goal is to add the seats that were not sold to the data set. The variable Placesold should be 0 instead of 1. In addition, the average price of the sold tickets should be used instead of the price (without 0).
To clarify my goal, I have added the missing rows for the reduced data set.
# Hand-built target data: the example extended with one row per unsold
# seat-pattern gap, Placesold = "0" and the average sold price filled in.
# Note: the original used `Name <- c(...)` inside data.frame(), which
# creates stray global variables and mangled column names that then had
# to be patched with colnames(); named arguments avoid both problems.
result <- data.frame(
  # "2019-4-07" (no zero-padding) is kept exactly as in the original post.
  Date = rep(c("2019-03-21", "2019-03-30", "2019-4-07"), 13),
  Block = c(rep("43L", 3), rep("15B", 3), rep("43L", 3),
            rep("15B", 3), rep("43L", 3), rep("15B", 24)),
  # Character on purpose: the filled averages ("41.67", "43.11") were
  # written as strings in the original.
  Price = c("24", "35", "30",
            "35", "45", "40",
            "26", "35", "30",
            "35", "45", "40",
            "25", "35", "30",
            "34", "43", "42",
            "35", "42", "45",
            "36", "45", "43",
            "36", "43", "40",
            "35", "41", "41.67",
            "32", "42", "41.67",
            "30", "42", "41.67",
            "35", "43.11", "41.67"),
  Max = c(rep(c("3", "3", "3", "10", "10", "10"), 2),
          rep("3", 3), rep("10", 24)),
  Actual = c(rep(c("2", "1", "2", "10", "9", "6"), 2),
             "2", "1", "2",
             rep(c("10", "9", "6"), 8)),
  # Column name keeps the original's spelling ("Temprature").
  Temprature = rep(c("15", "20", "18"), 13),
  Placesold = c("1", "1", "1", "1", "1", "1",
                "1", "0", "1",
                "1", "1", "1",
                "0", "0", "0",
                rep("1", 12),
                "1", "1", "0",
                "1", "1", "0",
                "1", "1", "0",
                "1", "0", "0"),
  stringsAsFactors = FALSE
)
The results of the blocks and the data as well as the occurrence of the variable "Placesold" look like this:
table(result$Date, result$Block)
table(result$Placesold)
15B 43L
2019-03-21 10 3
2019-03-30 10 3
2019-4-07 10 3
> table(result$Placesold)
0 1
9 30
My first thought was to create a matrix with more rows, but to be honest I don't really know how. I hope you can help me.
Thank you very much.
I use dplyr functions and base::merge. merge can perform cross join between data frames, vectors and other types.
Construction of each date and block pair - it includes unsold blocks of a date:
# ordered, unique vector of dates
dates <- example$Date %>% unique() %>% sort()
# ordered, unique vector of blocks
blocks <- example$Block %>% unique() %>% sort()
# insert dummy block to demonstrate effects of missing blocks
blocks <- c("11B", blocks)
# cross join of dates and blocks: each date paired with each block
# (the result is a data.frame)
eachDateBlock <- merge(dates, blocks, by = NULL)
# merge generates x and y as names for the resulting data.frame;
# I rename them as a preparation for left_join
eachDateBlock <- eachDateBlock %>% rename(Date = x, Block = y)
# rows from 'eachDateBlock' with a matching row in 'example' get values
# of variables, otherwise they are filled with NAs
extendedData <- eachDateBlock %>%
left_join(example, by = c("Date" = "Date", "Block" = "Block"))
# NOTE: before avgPrice you need something similar conversion - I ignore
# other numeric columns here
#example$Price <- as.double(example$Price)
#example$Placesold <- as.double(example$Placesold)
# Overwrite NAs in rows of supplied unsold blocks
# NOTE(review): mean(example$Price) is the grand mean over all sold
# tickets; if the average should be per date or per block, compute it
# grouped instead. Also, only one NA row is added per missing date-block
# pair -- not one row per unsold seat up to Max, as the question asks.
avgPrice <- mean(example$Price)
result <- extendedData %>% mutate(
Price = if_else(is.na(Price), avgPrice, Price),
Placesold = if_else(is.na(Placesold), 0, Placesold)
) %>% arrange(Date)
> table(result$Date, result$Block)
11B 15B 43L
2019-03-21 1 10 2
2019-03-30 1 9 1
2019-04-07 1 6 2
> table(result$Placesold)
0 1
3 30
> result
Date Block Price Max Actual Temprature Placesold
1 2019-03-21 11B 37.53333 <NA> <NA> <NA> 0
.
.
.
12 2019-03-21 43L 24.00000 3 2 15 1
13 2019-03-21 43L 26.00000 3 2 15 1
14 2019-03-30 11B 37.53333 <NA> <NA> <NA> 0
15 2019-03-30 15B 45.00000 10 9 20 1
.
.
.
24 2019-03-30 43L 35.00000 3 1 20 1
25 2019-04-07 11B 37.53333 <NA> <NA> <NA> 0
.
.
.
31 2019-04-07 15B 40.00000 10 6 18 1
32 2019-04-07 43L 30.00000 3 2 18 1
33 2019-04-07 43L 30.00000 3 2 18 1
I need to select a unique ID.x for each ID.y (forming unique pairs) that minimizes a distance value, starting from the lowest distance values. I feel like it's a bit like a sudoku puzzle because each x and y can only be used once, so information from each pair allows for matching other pairs.
In the example below, ID.x 55 is a better match for ID.y 1 than ID.x 56 is, because ID.x 56 is a better match for ID.y 2. Similarly, ID.x 58 can be matched to ID.y 4, because any other available option would be a greater distance, and ID.y 5 can then take ID.x 59 at distance 4. However, ID.y 7 cannot be matched because ID.x 61 and ID.x 62 are equally close.
Example:
DT = data.table(
ID.x = c("55", "55", "55", "55", "55", "55", "55", "56", "56", "56", "56", "56", "56", "56", "57", "57", "57", "57", "57", "57", "57", "58", "58", "58", "58", "58", "58", "58", "59", "59", "59", "59", "59", "59", "59", "60", "60", "60", "60", "60", "60", "60", "61", "61", "61", "61", "61", "61", "61", "62", "62", "62", "62", "62", "62", "62"),
ID.y = c("1", "2", "3", "4", "5", "6", "7", "1", "2", "3", "4", "5", "6", "7", "1", "2", "3", "4", "5", "6", "7", "1", "2", "3", "4", "5", "6", "7", "1", "2", "3", "4", "5", "6", "7", "1", "2", "3", "4", "5", "6", "7", "1", "2", "3", "4", "5", "6", "7", "1", "2", "3", "4", "5", "6", "7"),
distance = c("2", "3", "3", "4", "6", "6", "7", "2", "1", "2", "5", "5", "5", "6", "4", "4", "3", "5", "5", "5", "6", "5", "5", "5", "4", "4", "5", "6", "7", "7", "7", "6", "4", "6", "7", "6", "6", "6", "6", "4", "2", "5", "7", "7", "7", "7", "5", "5", "5", "6", "6", "6", "6", "4", "4", "5")
)
Goal:
ID.x ID.y distance
1: 55 1 2
2: 56 2 1
3: 57 3 3
4: 58 4 4
5: 59 5 4
6: 60 6 2
7: NA 7 NA
This first attempt, inspired by this question, does not work:
DT[DT[, .I[distance == min(distance)], by=ID.x]$V1][DT[, .I[1], by = ID.y]$V1]
UPDATE:
In response to the answers by #chinsoon12 and #paweł-chabros, here is an updated data.table that fixes a few things. It swaps x and y (my original question was matching x's with y's, but the more natural interpretation is y with x). This example removes the ambiguous matching for ID.y 7. In this example, the lowest distance matches ID.x 63. Separately, I also added a new ID.y 8, to clarify when no unambiguous match is possible (it matches ID.x 64 and 65 equally well). The answer should not select a match arbitrarily.
DT = data.table(
ID.y = c("55", "55", "55", "55", "55", "55", "55", "55", "56", "56", "56", "56", "56", "56", "56", "56", "57", "57", "57", "57", "57", "57", "57", "57", "58", "58", "58", "58", "58", "58", "58", "58", "59", "59", "59", "59", "59", "59", "59", "59", "60", "60", "60", "60", "60", "60", "60", "60", "61", "61", "61", "61", "61", "61", "61", "61", "62", "62", "62", "62", "62", "62", "62", "62", "63", "63", "63", "63", "63", "63", "63", "63", "64", "64", "64", "64", "64", "64", "64", "64", "65", "65", "65", "65", "65", "65", "65", "65"),
ID.x = c("1", "2", "3", "4", "5", "6", "7", "8", "1", "2", "3", "4", "5", "6", "7", "8", "1", "2", "3", "4", "5", "6", "7", "8", "1", "2", "3", "4", "5", "6", "7", "8", "1", "2", "3", "4", "5", "6", "7", "8", "1", "2", "3", "4", "5", "6", "7", "8", "1", "2", "3", "4", "5", "6", "7", "8", "1", "2", "3", "4", "5", "6", "7", "8", "1", "2", "3", "4", "5", "6", "7", "8", "1", "2", "3", "4", "5", "6", "7", "8", "1", "2", "3", "4", "5", "6", "7", "8"),
distance = c(2, 3, 3, 4, 6, 6, 7, 15, 2, 1, 2, 5, 5, 5, 6, 15, 4, 4, 3, 5, 5, 5, 6, 15, 5, 5, 5, 4, 4, 5, 6, 15, 7, 7, 7, 6, 4, 6, 7, 15, 6, 6, 6, 6, 4, 2, 5, 15, 7, 7, 7, 7, 5, 5, 6, 15, 6, 6, 6, 6, 4, 4, 10, 15, 11, 11, 11, 11, 11, 11, 5, 12, 11, 11, 11, 11, 11, 11, 11, 1, 11, 11, 11, 11, 11, 11, 11, 1)
)
Expected Result:
ID.y ID.x distance
1: 55 1 2
2: 56 2 1
3: 57 3 3
4: 58 4 4
5: 59 5 4
6: 60 6 2
7: 63 7 5
8: NA 8 NA
I'm using this code is to complete a fuzzy join using stringdist_join, as described in this question. I have two datasets that need matching (hence the ID.x and ID.y). In my case, I have pre-test and post-test scores that need to be matched by multiple unreliable characteristics.
Not clear to me why ID.x 62 and ID.y 7 with distance 5 is not feasible.
Assuming that ID.x 62, ID.y 7 and distance 5 is acceptable, a possible approach using data.table:
# Greedy nearest-match assignment: process rows from smallest distance
# upwards, giving each ID.x the closest ID.y that is still unclaimed.
# NOTE(review): `distance` is character in DT, so setorder sorts it
# lexicographically -- fine for the single-digit example, but convert to
# numeric for general data.
setorder(DT, distance)
choseny <- c()
ans <- DT[,
{
# First ID.y of this group (rows already sorted by distance) that has
# not been claimed by an earlier ID.x; NA if none remain.
y <- setdiff(ID.y, choseny)[1L]
# Record the claim. NOTE(review): relies on `choseny` persisting in
# the j-evaluation environment across groups -- confirm with the
# data.table version in use.
choseny <- c(choseny, y)
.(ID.y=y, dist=.SD[ID.y==y, distance[1L]])
},
by=.(ID.x)]
setorder(ans, ID.x)[]
output:
ID.x ID.y dist
1: 55 1 2
2: 56 2 1
3: 57 3 3
4: 58 4 4
5: 59 5 4
6: 60 6 2
7: 61 <NA> <NA>
8: 62 7 5
I am not sure if that's really the desired solution, but it should be helpful. Not super elegant, but it pretty much looks like the desired output.
# Rank candidates per ID.x by distance (ties share fractional ranks),
# keep each ID.x's lowest-ranked rows, then break remaining ties by the
# smaller ID.y rank.
# NOTE(review): this does not enforce uniqueness of ID.y across ID.x --
# see rows 7-8 of the output, where ID.y 5 is assigned more than once.
DT[, .(ID.y
, distance
, Row.Num = rank(distance)
, Row.Num.ID = rank(ID.y)), by = list(ID.x)][, .SD[Row.Num == min(Row.Num) ], by = ID.x][, .SD[Row.Num.ID == min(Row.Num.ID) ], by = ID.x]
> ID.x ID.y distance Row.Num Row.Num.ID
1: 55 1 2 1.0 1
2: 56 2 1 1.0 2
3: 57 3 3 1.0 3
4: 58 4 4 1.5 4
5: 59 5 4 1.0 5
6: 60 6 2 1.0 6
7: 61 5 5 2.0 5
8: 62 5 4 1.5 5
I don't know data.table well so I can give you only tidyverse solution. But maybe it will help you :)
library(tidyverse)
ID_y <- unique(DT$ID.y) # pool of still-unclaimed ID.y values
DT %>%
as_tibble() %>%
group_by(ID.x) %>%
# Handle the ID.x with the smallest attainable distance first, so the
# greedy claiming below sees the best candidates before they are taken.
mutate(min_dist = min(distance)) %>%
arrange(min_dist) %>%
nest() %>%
mutate(data = data %>% map(~ {
# Best remaining candidate for this ID.x (first row on ties).
min_row <- .x %>%
filter(ID.y %in% ID_y) %>%
filter(distance == min(distance)) %>%
slice(1)
# `<<-` mutates the shared pool so later iterations skip taken IDs.
ID_y <<- ID_y[ID_y != min_row$ID.y]
min_row
})) %>%
unnest() %>%
select(-min_dist) %>%
arrange(ID.x)
I am saving all unique values of ID.y. Then I calculate minimum distance for all combinations and arrange by this minimum distance to tackle those ones at first in map loop. After filtering the minimum distance I remove ID.y from the vector, so other ID.x are searching only in ID.y's that left.
I have a data frame, the rows of which I would like to sort based on time stamp.
V1 V2 V3 V4 V5 V6
1 {"2014-08-01T01:00:00": "64", "2014-08-01T13:00:00": "53", "2014-08-01T01:20:00": "73",
2 {"2014-08-02T18:00:00": "37", "2014-08-02T22:00:00": "56", "2014-08-02T17:00:00": "24",
3 {"2014-08-03T17:50:00": "78", "2014-08-03T04:20:00": "83", "2014-08-03T00:20:00": "73",
4 {"2014-08-04T15:00:00": "37", "2014-08-04T21:00:00": "39", "2014-08-04T15:20:00": "43",
5 {"2014-08-05T19:20:00": "78", "2014-08-05T13:20:00": "46", "2014-08-05T00:00:00": "62",
6 {"2014-08-06T11:00:00": "45", "2014-08-06T09:00:00": "56", "2014-08-06T21:50:00": "68",
V7 V8 V9 V10 V11 V12
1 "2014-08-01T13:20:00": "57", "2014-08-01T13:50:00": "47", "2014-08-01T20:50:00": "44",
2 "2014-08-02T01:00:00": "56", "2014-08-02T17:20:00": "42", "2014-08-02T01:20:00": "68",
3 "2014-08-03T23:00:00": "81", "2014-08-03T00:00:00": "63", "2014-08-03T00:50:00": "73",
4 "2014-08-04T02:00:00": "81", "2014-08-04T18:00:00": "29", "2014-08-04T02:20:00": "88",
5 "2014-08-05T00:20:00": "72", "2014-08-05T00:50:00": "77", "2014-08-05T19:00:00": "75",
6 "2014-08-06T14:20:00": "53", "2014-08-06T14:00:00": "40", "2014-08-06T23:20:00": "77",
Desired output
The output of only one row is shown below.
{"2014-08-01T01:00:00": "64", "2014-08-01T01:20:00": "73", "2014-08-01T13:00:00": "53", "2014-08-01T13:20:00": "57", "2014-08-01T13:50:00": "47", "2014-08-01T20:50:00": "44",
We convert the datetime columns (df2[c(TRUE, FALSE)]- we are subsetting by recycling the logical vector) to POSIXct class by looping through the columns with lapply, then order by row using apply with MARGIN=1 ('m1'). We split the time columns and the value columns by row to create two lists 'l1', 'l2', then use Map with paste to concatenate the string together after we order the elements based on 'm1'. This can be converted to data.frame with one column.
# Parse the datetime columns (df2[c(TRUE, FALSE)] recycles the logical
# vector to select every odd column) into POSIXct so they sort
# chronologically; the leading '{' of column V1 is stripped first.
df2[c(TRUE, FALSE)] <- lapply(df1[c(TRUE, FALSE)], function(x) as.POSIXct(sub('[{]', '', x), format = '%Y-%m-%dT%H:%M:%S:'))
# Per-row ordering permutation of the timestamps.
m1 <- apply(df2[c(TRUE, FALSE)], 1, order)
# Row-wise lists of the original timestamp strings (l1) and values (l2).
l1 <- split(as.matrix(df1[c(TRUE, FALSE)]), row(df1[c(TRUE, FALSE)]))
l2 <- split(as.matrix(df2[c(FALSE, TRUE)]), row(df2[c( FALSE, TRUE)]))
# Reassemble each row in time order, re-quoting timestamps and values,
# and collapse everything back into a single character column.
data.frame(col1=unlist(Map(function(x,y,z) paste0('{',
paste(gsub('^\\{*(\\d+.*)(\\:)', '"\\1"\\2', x[z]),
gsub('(\\d+)', '"\\1"', y[z]), sep=' ', collapse=' ')),
l1, l2, split(m1, col(m1)))), stringsAsFactors=FALSE)
col1
#1 {"2014-08-01T01:00:00": "64", "2014-08-01T01:20:00": "73", "2014-08-01T13:00:00": "53", "2014-08-01T13:20:00": "57", "2014-08-01T13:50:00": "47", "2014-08-01T20:50:00": "44",
#2 {"2014-08-02T01:00:00": "56", "2014-08-02T01:20:00": "68", "2014-08-02T17:00:00": "24", "2014-08-02T17:20:00": "42", "2014-08-02T18:00:00": "37", "2014-08-02T22:00:00": "56",
#3 {"2014-08-03T00:00:00": "63", "2014-08-03T00:20:00": "73", "2014-08-03T00:50:00": "73", "2014-08-03T04:20:00": "83", "2014-08-03T17:50:00": "78", "2014-08-03T23:00:00": "81",
#4 {"2014-08-04T02:00:00": "81", "2014-08-04T02:20:00": "88", "2014-08-04T15:00:00": "37", "2014-08-04T15:20:00": "43", "2014-08-04T18:00:00": "29", "2014-08-04T21:00:00": "39",
#5 {"2014-08-05T00:00:00": "62", "2014-08-05T00:20:00": "72", "2014-08-05T00:50:00": "77", "2014-08-05T13:20:00": "46", "2014-08-05T19:00:00": "75", "2014-08-05T19:20:00": "78",
#6 {"2014-08-06T09:00:00": "56", "2014-08-06T11:00:00": "45", "2014-08-06T14:00:00": "40", "2014-08-06T14:20:00": "53", "2014-08-06T21:50:00": "68", "2014-08-06T23:20:00": "77",
data
lines <- readLines(textConnection('V1 V2 V3 V4 V5 V6
1 {"2014-08-01T01:00:00": "64", "2014-08-01T13:00:00": "53", "2014-08-01T01:20:00": "73",
2 {"2014-08-02T18:00:00": "37", "2014-08-02T22:00:00": "56", "2014-08-02T17:00:00": "24",
3 {"2014-08-03T17:50:00": "78", "2014-08-03T04:20:00": "83", "2014-08-03T00:20:00": "73",
4 {"2014-08-04T15:00:00": "37", "2014-08-04T21:00:00": "39", "2014-08-04T15:20:00": "43",
5 {"2014-08-05T19:20:00": "78", "2014-08-05T13:20:00": "46", "2014-08-05T00:00:00": "62",
6 {"2014-08-06T11:00:00": "45", "2014-08-06T09:00:00": "56", "2014-08-06T21:50:00": "68",'))
lines2 <- readLines(textConnection('V7 V8 V9 V10 V11 V12
1 "2014-08-01T13:20:00": "57", "2014-08-01T13:50:00": "47", "2014-08-01T20:50:00": "44",
2 "2014-08-02T01:00:00": "56", "2014-08-02T17:20:00": "42", "2014-08-02T01:20:00": "68",
3 "2014-08-03T23:00:00": "81", "2014-08-03T00:00:00": "63", "2014-08-03T00:50:00": "73",
4 "2014-08-04T02:00:00": "81", "2014-08-04T18:00:00": "29", "2014-08-04T02:20:00": "88",
5 "2014-08-05T00:20:00": "72", "2014-08-05T00:50:00": "77", "2014-08-05T19:00:00": "75",
6 "2014-08-06T14:20:00": "53", "2014-08-06T14:00:00": "40", "2014-08-06T23:20:00": "77",'))
# Strip the leading row numbers and all quotes from the pasted console
# output, parse both halves, and stitch them back together column-wise.
d1 <- read.table(text=gsub('^\\d+\\s+|"', '', lines), header=TRUE, stringsAsFactors=FALSE)
d2 <- read.table(text=gsub('^\\d+\\s+|"', '', lines2), header=TRUE, stringsAsFactors=FALSE)
df1 <- cbind(d1, d2)
df2 <- df1 # working copy whose odd columns get converted to POSIXct below
I can apply the following code to apply a pairwiset.t.test function on each column of a certain dataframe, A, against a dependent parameter, DEP.
apply(A, 2, function(x) pairwise.t.test(DEP, x, p.adj = 'bonferroni')$p.value)
The output for this function results in a list type data. Something like the following shows:
$A
1 2 3 4
1 0.7236751 NA NA NA
2 0.7236751 0.7236751 NA NA
3 0.7236751 0.7236751 0.7236751 NA
4 0.7236751 0.7236751 0.7236751 0.7236751
$B
0
1 0.7236751
I would like to ask if there's a way to combine the two list mode data such that:
1 2 3 4 5
1 0.7236751 NA NA NA 0.7236751
2 0.7236751 0.7236751 NA NA NA
3 0.7236751 0.7236751 0.7236751 NA NA
4 0.7236751 0.7236751 0.7236751 0.7236751 NA
The output of dput(apply(A, 2, function(x) pairwise.t.test(DEP, x, p.adj = 'bonferroni')$p.value))
is as follows:
Dimnames = list(c("10", "11", "14", "15", "16", "17",
"18", "19", "20", "21", "23", "31", "33", "34", "36", "39", "40",
"42", "43", "45", "46", "47", "49", "50", "51", "52", "54", "59",
"60", "63", "64", "66", "70", "73"), c("6", "10", "11", "14",
"15", "16", "17", "18", "19", "20", "21", "23", "31", "33", "34",
"36", "39", "40", "42", "43", "45", "46", "47", "49", "50", "51",
"52", "54", "59", "60", "63", "64", "66", "70"))), Gender = structure(0.723675133647025, .Dim = c(1L,
1L), .Dimnames = list("1", "0"))), .Names = c("A", "B"
))