comparing two meta data and their variables and options - r

I am validating two data frames if they are consistent, its working on small dataframes perfectly but when records of data frame increases then it shows error
library(tidyverse)
df1 <- data.frame(MAN=c(6,6,4,6,8,6,8,4,4,6,6,8,8),MANi=c("OD","NY","CA","CA","OD","CA","OD","NY","OL","NY","OD","CA","OD"),
nune=c("akas","mani","juna","mau","nuh","kil","kman","nuha","huna","kman","nuha","huna","mani"),
klay=c(1,2,2,1,1,2,1,2,1,2,1,1,2),emial=c("dd","xyz","abc","dd","xyz","abc","dd","xyz","abc","dd","xyz","abc","dd"),Pass=c("Low","High","Low","Low","High","Low","High","High","Low","High","High","High","Low"),fri=c("KKK","USA","IND","SRI","PAK","CHI","JYP","TGA","KKK","USA","IND","SRI","PAK"),
mkl=c("m","f","m","m","f","m","m","f","m","m","f","m","m"),kin=c("Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Rec","Sent","Rec"),munc=c("Car","Bus","Truk","Cyl","Bus","Car","Bus","Bus","Bus","Car","Car","Cyl","Car"),
lone=c("Sr","jun","sr","jun","man","man","jr","Sr","jun","sr","jun","man","man"),wond=c("tko","kent","bho","kilt","kent","bho","kent","bho","bho","kilt","kent","bho","kilt"))
df2 <- data.frame(MAN=c(6,6,4,6,8,6,8,4,4,6,6,8,8,8,6),MANi=c("OD","NY","CA","CA","OD","CA","OD","NY","OL","ny","OD","CA","OD","NY","OL"),
nune=c("akas","mani","juna","mau","nuh","kil","kman","nuha","huna","kman","nuha","huna","mani","juna","mau"),
klay=c(1,2,2,1,1,2,1,2,1,2,1,1,2,2,1),emial=c("dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC"),Pass=c("Low","High","Low","Low","High","Low","High","High","Low","High","High","High","Low","High","High"),fri=c("KKK","USA","IND","SRI","PAK","CHI","JYP","TGA","KKK","USA","IND","SRI","PAK","CHI","JYP"),
mkl=c("male","female","male","male","female","male","male","female","male","male","female","male","male","female","male"),kin=c("Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Rec","Sent","Rec","Sent","Rec"),munc=c("Car","Bus","Truk","Cyl","Bus","Car","Bus","Bus","Bus","Car","Car","Cyl","Car","Bus","Bus"),
lone=c("Sr","jun","sr","jun","man","man","jr","Sr","jun","sr","jun","man","man","jr","man"),wond=c("tko","kent","bho","kilt","kent","bho","kent","bho","bho","kilt","kent","bho","kilt","kent","bho"))

Worth considering waldo::compare?
df1 <- data.frame(MAN=c(6,6,4,6,8,6,8,4,4,6,6,8,8),MANi=c("OD","NY","CA","CA","OD","CA","OD","NY","OL","NY","OD","CA","OD"),
nune=c("akas","mani","juna","mau","nuh","kil","kman","nuha","huna","kman","nuha","huna","mani"),
klay=c(1,2,2,1,1,2,1,2,1,2,1,1,2),emial=c("dd","xyz","abc","dd","xyz","abc","dd","xyz","abc","dd","xyz","abc","dd"),Pass=c("Low","High","Low","Low","High","Low","High","High","Low","High","High","High","Low"),fri=c("KKK","USA","IND","SRI","PAK","CHI","JYP","TGA","KKK","USA","IND","SRI","PAK"),
mkl=c("m","f","m","m","f","m","m","f","m","m","f","m","m"),kin=c("Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Rec","Sent","Rec"),munc=c("Car","Bus","Truk","Cyl","Bus","Car","Bus","Bus","Bus","Car","Car","Cyl","Car"),
lone=c("Sr","jun","sr","jun","man","man","jr","Sr","jun","sr","jun","man","man"),wond=c("tko","kent","bho","kilt","kent","bho","kent","bho","bho","kilt","kent","bho","kilt"))
df2 <- data.frame(MAN=c(6,6,4,6,8,6,8,4,4,6,6,8,8,8,6),MANi=c("OD","NY","CA","CA","OD","CA","OD","NY","OL","ny","OD","CA","OD","NY","OL"),
nune=c("akas","mani","juna","mau","nuh","kil","kman","nuha","huna","kman","nuha","huna","mani","juna","mau"),
klay=c(1,2,2,1,1,2,1,2,1,2,1,1,2,2,1),emial=c("dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC"),Pass=c("Low","High","Low","Low","High","Low","High","High","Low","High","High","High","Low","High","High"),fri=c("KKK","USA","IND","SRI","PAK","CHI","JYP","TGA","KKK","USA","IND","SRI","PAK","CHI","JYP"),
mkl=c("male","female","male","male","female","male","male","female","male","male","female","male","male","female","male"),kin=c("Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Rec","Sent","Rec","Sent","Rec"),munc=c("Car","Bus","Truk","Cyl","Bus","Car","Bus","Bus","Bus","Car","Car","Cyl","Car","Bus","Bus"),
lone=c("Sr","jun","sr","jun","man","man","jr","Sr","jun","sr","jun","man","man","jr","man"),wond=c("tko","kent","bho","kilt","kent","bho","kent","bho","bho","kilt","kent","bho","kilt","kent","bho"))
waldo::compare(df1, df2)
#> `attr(old, 'row.names')[11:13]`: 11 12 13
#> `attr(new, 'row.names')[11:15]`: 11 12 13 14 15
#>
#> old vs new
#> MAN MANi nune klay emial Pass fri mkl kin munc lone wond
#> - old[1, ] 6 OD akas 1 dd Low KKK m Sent Car Sr tko
#> + new[1, ] 6 OD akas 1 dd Low KKK male Sent Car Sr tko
#> - old[2, ] 6 NY mani 2 xyz High USA f Rec Bus jun kent
#> + new[2, ] 6 NY mani 2 xyz High USA female Rec Bus jun kent
#> - old[3, ] 4 CA juna 2 abc Low IND m Sent Truk sr bho
#> + new[3, ] 4 CA juna 2 ABC Low IND male Sent Truk sr bho
#> - old[4, ] 6 CA mau 1 dd Low SRI m Rec Cyl jun kilt
#> + new[4, ] 6 CA mau 1 dd Low SRI male Rec Cyl jun kilt
#> - old[5, ] 8 OD nuh 1 xyz High PAK f Sent Bus man kent
#> + new[5, ] 8 OD nuh 1 xyz High PAK female Sent Bus man kent
#> - old[6, ] 6 CA kil 2 abc Low CHI m Rec Car man bho
#> + new[6, ] 6 CA kil 2 ABC Low CHI male Rec Car man bho
#> - old[7, ] 8 OD kman 1 dd High JYP m Sent Bus jr kent
#> + new[7, ] 8 OD kman 1 dd High JYP male Sent Bus jr kent
#> - old[8, ] 4 NY nuha 2 xyz High TGA f Rec Bus Sr bho
#> + new[8, ] 4 NY nuha 2 xyz High TGA female Rec Bus Sr bho
#> - old[9, ] 4 OL huna 1 abc Low KKK m Sent Bus jun bho
#> + new[9, ] 4 OL huna 1 ABC Low KKK male Sent Bus jun bho
#> - old[10, ] 6 NY kman 2 dd High USA m Rec Car sr kilt
#> + new[10, ] 6 ny kman 2 dd High USA male Rec Car sr kilt
#> and 5 more ...
#>
#> `old$MAN[11:13]`: 6 8 8
#> `new$MAN[11:15]`: 6 8 8 8 6
#>
#> `old$MANi[10:13]`: "NY" "OD" "CA" "OD"
#> `new$MANi[7:15]`: "OD" "NY" "OL" "ny" "OD" "CA" "OD" "NY" "OL"
#>
#> `old$nune[11:13]`: "nuha" "huna" "mani"
#> `new$nune[11:15]`: "nuha" "huna" "mani" "juna" "mau"
#>
#> `old$klay[11:13]`: 1 1 2
#> `new$klay[11:15]`: 1 1 2 2 1
#>
#> old$emial | new$emial
#> [2] "xyz" - "dd" [1]
#> [3] "abc" - "xyz" [2]
#> [4] "dd" - "ABC" [3]
#> [5] "xyz" - "dd" [4]
#> [6] "abc" - "xyz" [5]
#> [7] "dd" - "ABC" [6]
#> [8] "xyz" - "dd" [7]
#> [9] "abc" - "xyz" [8]
#> [10] "dd" - "ABC" [9]
#> [11] "xyz" - "dd" [10]
#> ... ... ... and 5 more ...
#>
#> `old$Pass[11:13]`: "High" "High" "Low"
#> `new$Pass[11:15]`: "High" "High" "Low" "High" "High"
#>
#> `old$fri[11:13]`: "IND" "SRI" "PAK"
#> `new$fri[11:15]`: "IND" "SRI" "PAK" "CHI" "JYP"
#>
#> old$mkl | new$mkl
#> [1] "m" - "male" [1]
#> [2] "f" - "female" [2]
#> [3] "m" - "male" [3]
#> [4] "m" - "male" [4]
#> [5] "f" - "female" [5]
#> [6] "m" - "male" [6]
#> [7] "m" - "male" [7]
#> [8] "f" - "female" [8]
#> [9] "m" - "male" [9]
#> [10] "m" - "male" [10]
#> ... ... ... and 5 more ...
#>
#> And 4 more differences ...
Created on 2022-05-21 by the reprex package (v2.0.1)
Or the daff package for highlighted sortable / filterable differences:
library(daff)
diffs <- diff_data(df1, df2)
render_diff(diffs)

Related

How can I write a regex to order the paths of which I want to list them in numeric order

I have hundreds of .wav files and imported them using list.files. Something like above:
[1] "10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Balsam-poplar-English-0701.wav"
[2] "10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Birch-English-0700.wav"
[3] "10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Blueberry-English-0703.wav"
.......
[73] "3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Capercaillie-English-0069.wav"
[74] "3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Fat-tail-scorpion-English-0082.wav"
[75] "3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Fire-salamander-English-0067.wav"
I use the following code to reorder the file paths of which I want number in each subpath follows numberic order. I have tried the following
filename<- file_list[order(as.numeric(stringr::str_extract(file_list,"[0-9]+(.*?)")) )]
The result is something like:
[1] "3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Capercaillie-English-0069.wav"
[2] "3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Fat-tail-scorpion-English-0082.wav"
[3] "3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Fire-salamander-English-0067.wav"
.......
[73] "10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Balsam-poplar-English-0701.wav"
[74] "10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Birch-English-0700.wav"
[75] "10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Blueberry-English-0703.wav"
I also want the last subpath follows in numberic order, e.g. English-0067;English-0069. I tried to repeat the matching for the last subpath, but it will disorder the previous order followed by 3...10. How could I let all the numbers in the subpaths follows numberic order?
another option:
ord <- order(as.numeric(sub("(^\\d+)/.*$","\\1",files)), as.numeric(sub("^.*-(\\d+)\\.wav","\\1",files)))
files[ord]
#> [1] "3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Fire-salamander-English-0067.wav"
#> [2] "3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Capercaillie-English-0069.wav"
#> [3] "3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Fat-tail-scorpion-English-0082.wav"
#> [4] "10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Birch-English-0700.wav"
#> [5] "10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Balsam-poplar-English-0701.wav"
#> [6] "10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Blueberry-English-0703.wav"
Here's one approach:
vec <- c( "10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Balsam-poplar-English-0701.wav",
"10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Birch-English-0700.wav",
"10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Blueberry-English-0703.wav",
"3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Capercaillie-English-0069.wav",
"3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Fat-tail-scorpion-English-0082.wav",
"3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Fire-salamander-English-0067.wav")
nums <- strcapture("^([0-9]+).*\\b([0-9]+)\\.[a-z]+$", vec, proto=list(a=0L,b=0L))
nums
# a b
# 1 10 701
# 2 10 700
# 3 10 703
# 4 3 69
# 5 3 82
# 6 3 67
do.call(order, nums)
# [1] 6 4 5 2 1 3
vec[do.call(order, nums)]
# [1] "3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Fire-salamander-English-0067.wav"
# [2] "3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Capercaillie-English-0069.wav"
# [3] "3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Fat-tail-scorpion-English-0082.wav"
# [4] "10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Birch-English-0700.wav"
# [5] "10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Balsam-poplar-English-0701.wav"
# [6] "10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Blueberry-English-0703.wav"
If you needed to also include the BL-0001 in your ordering, all it would take is a small addition to the regex, an additional entry in proto=, and that's it. The use of do.call(order, nums) will handle 1 or more columns, regardless of how many.
Note that if you over-tune your regex, rows that don't match both groups here will return NA for both; this means it'll sort the NA rows last. If you find that one or more filenames are misordered, check the regex and the intermediate nums entries for those filenames.
A tidyverse solution: structuring data as a table and using stringr::str_detect() to arrange rows before extracting filenames.
vec <- c( "10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Balsam-poplar-English-0701.wav",
"10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Birch-English-0700.wav",
"10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Blueberry-English-0703.wav",
"3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Capercaillie-English-0069.wav",
"3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Fat-tail-scorpion-English-0082.wav",
"3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Fire-salamander-English-0067.wav")
library(dplyr)
library(stringr)
vec_tib <- tibble(filename = vec)
vec_tib <- mutate(vec_tib,
num_1 = str_extract(filename, "\\d+"),
num_2 = str_extract(filename, "\\d+(?=(\\.wav))"))
head(vec_tib, 3)
#> # A tibble: 3 × 3
#> filename num_1 num_2
#> <chr> <chr> <chr>
#> 1 10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Balsa… 10 0701
#> 2 10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Birch… 10 0700
#> 3 10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Blueb… 10 0703
vec_tib <- mutate(vec_tib, across(starts_with("num"), as.numeric))
vec_tib |>
arrange(num_1, num_2) |>
pull(filename)
#> [1] "3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Fire-salamander-English-0067.wav"
#> [2] "3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Capercaillie-English-0069.wav"
#> [3] "3/Project_English-3/BL-0002_Lesser-horseshoe-bat/Fat-tail-scorpion-English-0082.wav"
#> [4] "10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Birch-English-0700.wav"
#> [5] "10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Balsam-poplar-English-0701.wav"
#> [6] "10/Project_English-10/BL-0001_A-conifer-cone-contains-seeds/Blueberry-English-0703.wav"
Created on 2022-11-28 with reprex v2.0.2

Reshape long to wide with dcast

I need to reshape a dataframe csf_antibio (sample below) from long to wide format, separating all values per patient by row.
study_id,proanbx_dt,proanbx_tm,name,othername,route,dosage,units,doses,freq
CHLA_0001,2021-07-22,20:01:00,ceftriaxone,,IV,1250,mg,4,13
CHLA_0001,2021-07-22,20:19:00,metronidazole,,IV,250,mg,5,9
CHLA_0001,2021-07-22,23:17:00,vancomycin,,IV,350,mg,3,6
CHLA_0001,2021-08-09,19:34:00,cefazolin,,IV,738,mg,1,8
CHLA_0002,2020-12-18,0:30:00,cefepime,,IV,75,mg,5,8
CHLA_0002,2020-12-18,1:03:00,vancomycin,,IV,23,mg,4,13
CHLA_0002,2020-12-19,18:15:00,cefepime,,IV,60,mg,6,8
CHLA_0002,2020-12-20,4:18:00,vancomycin,,IV,24,mg,4,12
CHLA_0003,2021-04-20,15:17:00,meropenem,,IV,200,mg,2,1
CHLA_0003,2021-04-21,2:20:00,meropenem,,IV,400,mg,17,8
CHLA_0003,2021-04-22,14:16:00,Other,sulfamethoxazole-trimethoprim,IV,50,mg,9,12
I tried the following without success:
csfmelt <- melt(csf_antibio, id.vars=1:1)
csf <- dcast(csfmelt, study_id ~ variable, value.var = "value", fun.aggregate = sum)
I want the final dataframe to have each study id per row with variables
study_id,proanbx_dt1,proanbx_tm1,name1,othername1,route1,dosage1,units1,doses1,freq1,proanbx_dt2,proanbx_tm2,name2,othername2,route2,dosage2,units2,doses2,freq2,proanbx_dt3,proanbx_tm3,name3,othername3,route3,dosage3,units3,doses3,freq3,proanbx_dt4,proanbx_tm4,name4,othername4,route4,dosage4,units4,doses4,freq4
CHLA_0001,2021-07-22,20:01:00,ceftriaxone,,IV,1250,mg,4,13, 2021-07-22,20:19:00,metronidazole,,IV,250,mg,5,9, 2021-07-22,23:17:00,vancomycin,,IV,350,mg,3,6,2021-08-09,19:34:00,cefazolin,,IV,738,mg,1,8
CHLA_0002,2020-12-18,0:30:00,cefepime,,IV,75,mg,5,8,2020-12-18,1:03:00,vancomycin,,IV,23,mg,4,13,2020-12-19,18:15:00,cefepime,,IV,60,mg,6,8,2020-12-20,4:18:00,vancomycin,,IV,24,mg,4,12,2021-04-20,15:17:00,meropenem,,IV,200,mg,2,1,2021-04-21,2:20:00,meropenem,,IV,400,mg,17,8,2021-04-22,14:16:00,Other,sulfamethoxazole-trimethoprim,IV,50,mg,9,12
Thanks in advance!
Your desired output has a "number" component that is not naturally inferred by dcast. We can add it relatively easily with ave (base R, certainly this can be done just as easily in data.table or dplyr groupings).
reshape2 and base R
csfmelt$num <- ave(seq(nrow(csfmelt)), csfmelt[c("study_id","variable")], FUN = seq_along)
head(csfmelt)
# study_id variable value num
# 1 CHLA_0001 proanbx_dt 2021-07-22 1
# 2 CHLA_0001 proanbx_dt 2021-07-22 2
# 3 CHLA_0001 proanbx_dt 2021-07-22 3
# 4 CHLA_0001 proanbx_dt 2021-08-09 4
# 5 CHLA_0002 proanbx_dt 2020-12-18 1
# 6 CHLA_0002 proanbx_dt 2020-12-18 2
csfwide <- reshape2::dcast(csfmelt, study_id ~ variable + num, value.var = "value")
csfwide
# study_id proanbx_dt_1 proanbx_dt_2 proanbx_dt_3 proanbx_dt_4 proanbx_tm_1 proanbx_tm_2 proanbx_tm_3 proanbx_tm_4 name_1 name_2 name_3 name_4 othername_1 othername_2 othername_3 othername_4 route_1 route_2 route_3 route_4 dosage_1 dosage_2 dosage_3 dosage_4 units_1 units_2 units_3 units_4 doses_1 doses_2 doses_3 doses_4 freq_1 freq_2 freq_3 freq_4
# 1 CHLA_0001 2021-07-22 2021-07-22 2021-07-22 2021-08-09 20:01:00 20:19:00 23:17:00 19:34:00 ceftriaxone metronidazole vancomycin cefazolin IV IV IV IV 1250 250 350 738 mg mg mg mg 4 5 3 1 13 9 6 8
# 2 CHLA_0002 2020-12-18 2020-12-18 2020-12-19 2020-12-20 0:30:00 1:03:00 18:15:00 4:18:00 cefepime vancomycin cefepime vancomycin IV IV IV IV 75 23 60 24 mg mg mg mg 5 4 6 4 8 13 8 12
# 3 CHLA_0003 2021-04-20 2021-04-21 2021-04-22 <NA> 15:17:00 2:20:00 14:16:00 <NA> meropenem meropenem Other <NA> sulfamethoxazole-trimethoprim <NA> IV IV IV <NA> 200 400 50 <NA> mg mg mg <NA> 2 17 9 <NA> 1 8 12 <NA>
The column order is not what you requested, but it can be conformed a bit with this:
variables <- as.character(unique(csfmelt$variable))
sub(".*_", "", names(csfwide)[-(1:2)])
# [1] "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4" "1" "2" "3" "4"
sub("_[^_]*$", "", names(csfwide)[-(1:2)])
# [1] "proanbx_dt" "proanbx_dt" "proanbx_dt" "proanbx_tm" "proanbx_tm" "proanbx_tm" "proanbx_tm" "name" "name" "name" "name" "othername" "othername" "othername" "othername" "route" "route" "route" "route" "dosage"
# [21] "dosage" "dosage" "dosage" "units" "units" "units" "units" "doses" "doses" "doses" "doses" "freq" "freq" "freq" "freq"
nms <- names(csfwide)[-(1:2)]
newnms <- nms[order(sub(".*_", "", nms), match(nms, variables))]
csfwide2 <- subset(csfwide, select = c(names(csfwide)[1:2], newnms))
csfwide2
# study_id proanbx_dt_1 proanbx_tm_1 name_1 othername_1 route_1 dosage_1 units_1 doses_1 freq_1 proanbx_dt_2 proanbx_tm_2 name_2 othername_2 route_2 dosage_2 units_2 doses_2 freq_2 proanbx_dt_3 proanbx_tm_3 name_3 othername_3 route_3 dosage_3 units_3 doses_3 freq_3 proanbx_dt_4 proanbx_tm_4 name_4 othername_4 route_4 dosage_4 units_4 doses_4 freq_4
# 1 CHLA_0001 2021-07-22 20:01:00 ceftriaxone IV 1250 mg 4 13 2021-07-22 20:19:00 metronidazole IV 250 mg 5 9 2021-07-22 23:17:00 vancomycin IV 350 mg 3 6 2021-08-09 19:34:00 cefazolin IV 738 mg 1 8
# 2 CHLA_0002 2020-12-18 0:30:00 cefepime IV 75 mg 5 8 2020-12-18 1:03:00 vancomycin IV 23 mg 4 13 2020-12-19 18:15:00 cefepime IV 60 mg 6 8 2020-12-20 4:18:00 vancomycin IV 24 mg 4 12
# 3 CHLA_0003 2021-04-20 15:17:00 meropenem IV 200 mg 2 1 2021-04-21 2:20:00 meropenem IV 400 mg 17 8 2021-04-22 14:16:00 Other sulfamethoxazole-trimethoprim IV 50 mg 9 12 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#r2evans gave you a great answer, but I was thinking about your comments regarding dates and time. You didn't provide how you collected this data, so I can't tell you how to import it this way. However, I did convert these variables in the following code. That being said, adding dates isn't meaningful. I was thinking that the number of days and the amount of time passed might be more along the lines of what you were looking for for those particular variables. Unfortunately, I wasn't able to figure out how to make it work with reshape2. This uses dplyr, tidyselect and hms. Although, you would only have to call dplyr, because I've appended the packages for the applicable functions. (You need the packages installed, though.)
I didn't keep the name and othername because it's not multiple entries.
library(dplyr)
csf_antibio = read.table(header = T, sep = ",", text = "study_id,proanbx_dt,proanbx_tm,name,othername,route,dosage,units,doses,freq
CHLA_0001,2021-07-22,20:01:00,ceftriaxone,,IV,1250,mg,4,13
CHLA_0001,2021-07-22,20:19:00,metronidazole,,IV,250,mg,5,9
CHLA_0001,2021-07-22,23:17:00,vancomycin,,IV,350,mg,3,6
CHLA_0001,2021-08-09,19:34:00,cefazolin,,IV,738,mg,1,8
CHLA_0002,2020-12-18,0:30:00,cefepime,,IV,75,mg,5,8
CHLA_0002,2020-12-18,1:03:00,vancomycin,,IV,23,mg,4,13
CHLA_0002,2020-12-19,18:15:00,cefepime,,IV,60,mg,6,8
CHLA_0002,2020-12-20,4:18:00,vancomycin,,IV,24,mg,4,12
CHLA_0003,2021-04-20,15:17:00,meropenem,,IV,200,mg,2,1
CHLA_0003,2021-04-21,2:20:00,meropenem,,IV,400,mg,17,8
CHLA_0003,2021-04-22,14:16:00,Other,sulfamethoxazole-trimethoprim,IV,50,mg,9,12")
Because the time is truly linked to the date, I wrote a function to process the time difference.
timer <- function(df1){
maxtm = max(df1$proanbx_tm[df1$proanbx_dt == max(df1$proanbx_dt)]) %>% hms::as_hms()
mintm = min(df1$proanbx_tm[df1$proanbx_dt == min(df1$proanbx_dt)]) %>% hms::as_hms()
if(maxtm > mintm){
tmr = (maxtm - mintm) %>% hms::as_hms() # captures mult entries in the same day
} else if(mintm > maxtm) {
tmr = (maxtm - mintm) + hms::as_hms('24:00:00') # add a full day
} else { # only one entry or the time is identical in max/min
tmr = hms::as_hms('0')
}
return(tmr)
}
I collected the column names to return the columns to the original order.
ordNames = names(csf_antibio) # collect names to return order to columns
# [1] "study_id" "proanbx_dt" "proanbx_tm" "name" "othername" "route"
# [7] "dosage" "units" "doses" "freq"
# names kept = ordNames[,c(1:3,6:10)]
Find the sums and differences in time
csf2 <- csf_antibio %>%
mutate(proanbx_dt = as.Date(proanbx_dt), # convert to date
proanbx_tm = hms::as_hms(proanbx_tm)) %>% # convert to time
group_by(study_id) %>% # group by study
summarise(proanbx_tm = timer(.data), # difference in time
proanbx_dt = max(proanbx_dt) - min(proanbx_dt), # difference in days
across(tidyselect:::where(is.integer), sum),
units = "mg",
route = "IV") %>%
select(ordNames[c(1:3,6:10)])
head(csf2)
# # A tibble: 3 × 8
# study_id proanbx_dt proanbx_tm route dosage units doses freq
# <chr> <drtn> <time> <chr> <int> <chr> <int> <int>
# 1 CHLA_0001 18 days 23:33 IV 2588 mg 13 36
# 2 CHLA_0002 2 days 03:48 IV 182 mg 19 41
# 3 CHLA_0003 2 days 22:59 IV 650 mg 28 21

Reordering factors by group using fct_relevel only change factor order in first group

Context: I need to use factor order to make arrange diplays a table in a certain way. I.e I want to get a "Total" line at the end of each group.
Problem: using fct_relevel I acheive the expected output only within the first group.
Repex:
library(dplyr)
library(forcats)
total_masses <- starwars %>%
group_by(species) %>%
summarise(mass = sum(mass),
name = "Total mass")
df <- bind_rows(starwars, total_masses) %>%
select(species, name, mass) %>%
group_by(species) %>%
mutate(name = fct_relevel(name, "Total mass", after = Inf)) %>%
arrange(mass, name, species)
df
#> # A tibble: 125 x 3
#> # Groups: species [38]
#> species name mass
#> <chr> <fct> <dbl>
#> 1 Aleena Ratts Tyerell 15
#> 2 Aleena Total mass 15
#> 3 Yoda's species Total mass 17 # I expect this line to be at the end of the Yoda's species group
#> 4 Yoda's species Yoda 17
#> 5 Ewok Total mass 20
#> 6 Ewok Wicket Systri Warrick 20
#> 7 Droid R2-D2 32
#> 8 Droid R5-D4 32
#> 9 Dug Total mass 40
#> 10 Dug Sebulba 40
#> # ... with 115 more rows
Created on 2021-05-31 by the reprex package (v2.0.0)
I expect each Total mass to be at the end of each species category:
species name mass
<chr> <fct> <dbl>
1 Aleena Ratts Tyerell 15
2 Aleena Total mass 15
3 Yoda's species Yoda 17
4 Yoda's species Total mass 17
etc...
I feel like the solution is very obvious but I was not able to find the right keywords to solve it... Any tip is welcome!
Then just do the fct_relevel without grouping, then group again and do anything you want within group.
library(dplyr)
library(forcats)
total_masses <- starwars %>%
group_by(species) %>%
summarise(mass = sum(mass),
name = "Total mass")
# your codes
df <- bind_rows(starwars, total_masses) %>%
select(species, name, mass) %>%
group_by(species) %>%
mutate(name = fct_relevel(name, "Total mass", after = Inf)) %>%
arrange(mass, name, species)
# Which result Total Mass is at 2nd so it will be above other levels
levels(df$name)
#> [1] "Ratts Tyerell" "Total mass" "Dexter Jettster"
#> [4] "Ki-Adi-Mundi" "Mas Amedda" "Zam Wesell"
#> [7] "BB8" "C-3PO" "IG-88"
...
Instead of that you can ungroup then fct_relevel, then group_by then arrange.
df <- bind_rows(starwars, total_masses) %>%
select(species, name, mass) %>% ungroup() %>%
mutate(name = fct_relevel(name, "Total mass", after = Inf)) %>%
group_by(species) %>%
arrange(mass, name, species)
# Output
df
#> # A tibble: 125 x 3
#> # Groups: species [38]
#> species name mass
#> <chr> <fct> <dbl>
#> 1 Aleena Ratts Tyerell 15
#> 2 Aleena Total mass 15
#> 3 Yoda's species Yoda 17
#> 4 Yoda's species Total mass 17
#> 5 Ewok Wicket Systri Warrick 20
#> 6 Ewok Total mass 20
#> 7 Droid R2-D2 32
#> 8 Droid R5-D4 32
#> 9 Dug Sebulba 40
#> 10 Dug Total mass 40
#> # … with 115 more rows
# As you see now Total Mass is at the last item
levels(df$name)
#> [1] "Ackbar" "Adi Gallia" "Anakin Skywalker"
#> [4] "Arvel Crynyd" "Ayla Secura" "Bail Prestor Organa"
#> [7] "Barriss Offee" "BB8" "Ben Quadinaros"
#> [10] "Beru Whitesun lars" "Bib Fortuna" "Biggs Darklighter"
#> [13] "Boba Fett" "Bossk" "C-3PO"
#> [16] "Captain Phasma" "Chewbacca" "Cliegg Lars"
#> [19] "Cordé" "Darth Maul" "Darth Vader"
#> [22] "Dexter Jettster" "Dooku" "Dormé"
#> [25] "Dud Bolt" "Eeth Koth" "Finis Valorum"
#> [28] "Finn" "Gasgano" "Greedo"
#> [31] "Gregar Typho" "Grievous" "Han Solo"
#> [34] "IG-88" "Jabba Desilijic Tiure" "Jango Fett"
#> [37] "Jar Jar Binks" "Jek Tono Porkins" "Jocasta Nu"
#> [40] "Ki-Adi-Mundi" "Kit Fisto" "Lama Su"
#> [43] "Lando Calrissian" "Leia Organa" "Lobot"
#> [46] "Luke Skywalker" "Luminara Unduli" "Mace Windu"
#> [49] "Mas Amedda" "Mon Mothma" "Nien Nunb"
#> [52] "Nute Gunray" "Obi-Wan Kenobi" "Owen Lars"
#> [55] "Padmé Amidala" "Palpatine" "Plo Koon"
#> [58] "Poe Dameron" "Poggle the Lesser" "Quarsh Panaka"
#> [61] "Qui-Gon Jinn" "R2-D2" "R4-P17"
#> [64] "R5-D4" "Ratts Tyerell" "Raymus Antilles"
#> [67] "Rey" "Ric Olié" "Roos Tarpals"
#> [70] "Rugor Nass" "Saesee Tiin" "San Hill"
#> [73] "Sebulba" "Shaak Ti" "Shmi Skywalker"
#> [76] "Sly Moore" "Tarfful" "Taun We"
#> [79] "Tion Medon" "Wat Tambor" "Watto"
#> [82] "Wedge Antilles" "Wicket Systri Warrick" "Wilhuff Tarkin"
#> [85] "Yarael Poof" "Yoda" "Zam Wesell"
#> [88] "Total mass"
Created on 2021-05-31 by the reprex package (v2.0.0)

R: if_else, ifelse

Am new to R and getting to it after a long time as well.
I got the NFL week 1 data from the site and trying to execute a small ifelse and it doesn't work.
I want the team name in the result field
#this works only when condition is true and doesn't when condition is false
wk1$tm_won= if_else(wk1$home_score < wk1$away_score, wk1$away_team, wk1$home_team)
#this doesn't work - gives me the difference
wk1$tm_won1 <- ifelse(wk1$home_score < wk1$away_score, wk1$away_team, wk1$home_team)
#this doesn't work - gives me difference and not the team name
wk1 %>%
mutate(tm_won2 = ifelse (home_score < away_score, away_team,
ifelse (home_score > away_score, home_team, NA)))
#this doesn't work
wk1 %>%
mutate(tm_won = case_when(
home_score < away_score ~ away_team,
home_score > away_score ~ home_team,
TRUE ~ a ))# DRAW
My result dataset:
season week home_team away_team home_score away_score tm_won tm_won1 tm_won2
2019 1 CHI GB 3 10 GB 7 7
2019 1 CAR LA 27 30 LA 11 11
2019 1 PHI WAS 32 27 <NA> 14 14
2019 1 NYJ BUF 16 17 BUF 3 3
2019 1 MIN ATL 28 12 <NA> 9 9
What's happening here is that your teams are actually factors (which under the hood are just integer vectors) instead of characters. If the columns are characters, essentially all of your solutions work as expected. The reason the output is different is because dplyr tries to be clever about factors and keep them human-readable when it can.
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
wk1 <- read.table(text = 'season week home_team away_team home_score away_score tm_won tm_won1 tm_won2
2019 1 CHI GB 3 10 GB 7 7
2019 1 CAR LA 27 30 LA 11 11
2019 1 PHI WAS 32 27 <NA> 14 14
2019 1 NYJ BUF 16 17 BUF 3 3
2019 1 MIN ATL 28 12 <NA> 9 9', header = T, stringsAsFactors = FALSE)
if_else(wk1$home_score < wk1$away_score, wk1$away_team, wk1$home_team)
#> [1] "GB" "LA" "PHI" "BUF" "MIN"
ifelse(wk1$home_score < wk1$away_score, wk1$away_team, wk1$home_team)
#> [1] "GB" "LA" "PHI" "BUF" "MIN"
wk1 <- read.table(text = 'season week home_team away_team home_score away_score tm_won tm_won1 tm_won2
2019 1 CHI GB 3 10 GB 7 7
2019 1 CAR LA 27 30 LA 11 11
2019 1 PHI WAS 32 27 <NA> 14 14
2019 1 NYJ BUF 16 17 BUF 3 3
2019 1 MIN ATL 28 12 <NA> 9 9', header = T)
if_else(wk1$home_score < wk1$away_score, wk1$away_team, wk1$home_team)
#> Warning in `[<-.factor`(`*tmp*`, i, value = structure(c(5L, 3L), .Label =
#> c("CAR", : invalid factor level, NA generated
#> [1] GB LA <NA> BUF <NA>
#> Levels: ATL BUF GB LA WAS
ifelse(wk1$home_score < wk1$away_score, wk1$away_team, wk1$home_team)
#> [1] 3 4 5 2 3
Created on 2019-10-09 by the reprex package (v0.3.0)
Another option is to use as.character in the if_else
dplyr::if_else(wk1$home_score < wk1$away_score,
as.character(wk1$away_team),
as.character(wk1$home_team))

R function to fix automatically formatted data

I am currently analyzing a baseball data set that has the count data included, however, some of the data has automatically been formatted as a date.
I have already tried using as.numeric but it does not help. I have provided a sample of the data below:
Count(Factor) 0-0 0-1 0-2 1-Feb 1-Jan 1-Mar 2-Feb 2-Jan 2-Mar
Feb-00 Jan-00 Mar-00
I would like to remove the date format. For instance, I want to see 1-Feb as 1-2, 1-Jan as 1-1, 1-Mar as 1-3, Feb-00 as 2-0.
Does anyone have any suggestions on how to do so?
You can replace the abbreviated months with their relevant calendar position by referencing months.abb. Below I have created a general function using Base R.
## function to apply
month_num <- function(x){
if (! grepl('\\w{3}', x))
return(x)
gsub('/?\\w{3}', as.character(match(regmatches(x, regexpr('(\\w{3})', x)), month.abb)), x)
}
## vector
strings <- c( '0-0', '0-1' ,'0-2', '1-Feb', '1-Jan', '1-Mar', '2-Feb', '2-Jan', '2-Mar', 'Feb-00', '/Jan-00', 'Mar-00')
sapply(strings, month_num, USE.NAMES = FALSE)
#> [1] "0-0" "0-1" "0-2" "1-2" "1-1" "1-3" "2-2" "2-1" "2-3" "2-00"
#> [11] "1-00" "3-00"
## data.frame or matrix
tmp <- data.frame(
strings = c( '0-0', '0-1' ,'0-2', '1-Feb', '1-Jan', '1-Mar', '2-Feb', '2-Jan', '2-Mar', 'Feb-00', '/Jan-00', 'Mar-00')
)
tmp$strings <- apply(tmp, 1, month_num)
tmp
#> strings
#> 1 0-0
#> 2 0-1
#> 3 0-2
#> 4 1-2
#> 5 1-1
#> 6 1-3
#> 7 2-2
#> 8 2-1
#> 9 2-3
#> 10 2-00
#> 11 1-00
#> 12 3-00
## list
strings <- list( '0-0', '0-1' ,'0-2', '1-Feb', '1-Jan', '1-Mar', '2-Feb', '2-Jan', '2-Mar', 'Feb-00', '/Jan-00', 'Mar-00')
strings <- lapply(strings, month_num)
tail(strings)
#> [[1]]
#> [1] "2-2"
#>
#> [[2]]
#> [1] "2-1"
#>
#> [[3]]
#> [1] "2-3"
#>
#> [[4]]
#> [1] "2-00"
#>
#> [[5]]
#> [1] "1-00"
#>
#> [[6]]
#> [1] "3-00"
Created on 2019-02-12 by the reprex package (v0.2.1)

Resources