Keep column names for createDummyFeatures "reference" (n-1) - r

I have this kind of data.
library(dplyr)
glimpse(samp)
Observations: 5
Variables: 5
$ review_count <int> 68, 3, 7, 9, 5
$ Alcohol <fct> full_bar, NA, full_bar, beer_and_wi...
$ BikeParking <fct> True, NA, False, NA, NA
$ BusinessAcceptsBitcoin <fct> NA, NA, NA, NA, NA
$ BusinessAcceptsCreditCards <fct> True, NA, NA, True, True
I want to create 1-p dummy features. The createDummyFeatures function of the mlr package has the option reference to do this.
library(mlr)
dummy = createDummyFeatures(samp, target = "review_count", method = "reference")
The problem is that it doesn´t keep the original column names.
glimpse(dummy)
Observations: 5
Variables: 6
$ review_count <int> 68, 3, 7, 9, 5
$ Alcohol.full_bar <dbl> 1, NA, 1, 0, NA
$ Alcohol.none <dbl> 0, NA, 0, 0, NA
$ True <dbl> 1, NA, 0, NA, NA
$ True.1 <dbl> NA, NA, NA, NA, NA
$ True.2 <dbl> 1, NA, NA, 1, 1
The question is how can I keep them?
An Idea is to create them by the 1-of-nmethod and then remove all columns which contain "False".
dummy2 = createDummyFeatures(samp, target = "review_count")
dummy2 = dummy2 %>%
select(-contains("False"))
glimpse(dummy2)
Observations: 5
Variables: 7
$ review_count <int> 68, 3, 7, 9, 5
$ Alcohol.beer_and_wine <dbl> 0, NA, 0, 1, NA
$ Alcohol.full_bar <dbl> 1, NA, 1, 0, NA
$ Alcohol.none <dbl> 0, NA, 0, 0, NA
$ BikeParking.True <dbl> 1, NA, 0, NA, NA
$ BusinessAcceptsBitcoin.True <dbl> NA, NA, NA, NA, NA
$ BusinessAcceptsCreditCards.True <dbl> 1, NA, NA, 1, 1
However, I don´t know if it is the same as n-1 especially for the factors with more then 2 levels (The dummy coding is for an XGBoost regression where "review count" is the target variable).
dput(samp)
structure(list(review_count = c(68L, 3L, 7L, 9L, 5L), Alcohol = structure(c(2L,
NA, 2L, 1L, NA), .Label = c("beer_and_wine", "full_bar", "none"
), class = "factor"), BikeParking = structure(c(2L, NA, 1L, NA,
NA), .Label = c("False", "True"), class = "factor"), BusinessAcceptsBitcoin = structure(c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), .Label = c("False",
"True"), class = "factor"), BusinessAcceptsCreditCards = structure(c(2L,
NA, NA, 2L, 2L), .Label = c("False", "True"), class = "factor")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -5L))
Edit
For those who have the same problem, I fixed this issue using caret.
library(caret)
dummy_dat = dummyVars("~ .", data = samp, fullRank = T)
dat = data.frame(predict(dummy_dat, newdata = samp))

Related

Applying the same operation with multiple columns of similar names in R

I'm wondering if there is a way to simplify this code to avoid repetition givent that the column names are similar excepting one character that increases for each operation.
out <- df %>%
mutate (ATN1.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch1/RefCh1)),
ATN2.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch2/RefCh2)),
ATN3.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch3/RefCh3)),
ATN4.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch4/RefCh4)),
ATN5.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch5/RefCh5)),
ATN6.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch6/RefCh6)),
ATN7.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch7/RefCh7)))
This is a small subset of my data if you wanna play with it
df = structure(list(Status = c(1, 17, 1, 1, 1, 1, 2, 0, 0, 0), ATN1.1 = c(NA,
NA, NA, NA, NA, NA, 0, 0.187761662304176, 0.373310604025045,
0.570139498143909), ATN2.1 = c(NA, NA, NA, NA, NA, NA, 0, 0.136443172947395,
0.269071359915515, 0.407552762179439), ATN3.1 = c(NA, NA, NA,
NA, NA, NA, 0, 0.113733164068766, 0.224219770615697, 0.336923929839777
), ATN4.1 = c(NA, NA, NA, NA, NA, NA, 0, 0.0942969310983806,
0.186894753425896, 0.279629737677226), ATN5.1 = c(NA, NA, NA,
NA, NA, NA, 0, 0.0753327883349684, 0.149617411430523, 0.22690457078205
), ATN6.1 = c(NA, NA, NA, NA, NA, NA, 0, 0.0493106158715682,
0.100348708536177, 0.155828822066352), ATN7.1 = c(NA, NA, NA,
NA, NA, NA, 0, 0.0526398637123631, 0.103191368342497, 0.154644102801848
), ATN0.1.1 = c(NA, NA, NA, NA, NA, NA, 15.054824247419, 15.054824247419,
15.054824247419, 15.054824247419), ATN0.2.1 = c(NA, NA, NA, NA,
NA, NA, 24.1338734012274, 24.1338734012274, 24.1338734012274,
24.1338734012274), ATN0.3.1 = c(NA, NA, NA, NA, NA, NA, 27.4233147524393,
27.4233147524393, 27.4233147524393, 27.4233147524393), ATN0.4.1 = c(NA,
NA, NA, NA, NA, NA, 20.8560560826831, 20.8560560826831, 20.8560560826831,
20.8560560826831), ATN0.5.1 = c(NA, NA, NA, NA, NA, NA, 17.1645092239121,
17.1645092239121, 17.1645092239121, 17.1645092239121), ATN0.6.1 = c(NA,
NA, NA, NA, NA, NA, 4.4180613710882, 4.4180613710882, 4.4180613710882,
4.4180613710882), ATN0.7.1 = c(NA, NA, NA, NA, NA, NA, 10.8192165605015,
10.8192165605015, 10.8192165605015, 10.8192165605015), Sen1Ch1 = c(0,
99, 0, 783198, 785643, 787093, 786717, 785935, 784922, 783784
), Sen2Ch1 = c(0, 324, 0, 793643, 796398, 798041, 798658, 798957,
799003, 798951), Sen1Ch2 = c(0, 53, 0, 739627, 741339, 742308,
741804, 741195, 740403, 739520), Sen2Ch2 = c(0, 416, 0, 743716,
745420, 746399, 746532, 746599, 746467, 746279), Sen1Ch3 = c(0,
49, 0, 720709, 722113, 722900, 722515, 722002, 721364, 720681
), Sen2Ch3 = c(0, 294, 0, 734485, 735877, 736650, 736749, 736783,
736664, 736513), Sen1Ch4 = c(0, 61, 0, 732332, 732529, 732487,
731524, 730678, 729723, 728756), Sen2Ch4 = c(0, 222, 0, 737261,
737172, 736976, 736329, 735869, 735302, 734762), Sen1Ch5 = c(0,
59, 0, 765776, 767327, 768116, 767883, 767617, 767121, 766567
), Sen2Ch5 = c(0, 248, 0, 775632, 777074, 777800, 777883, 777970,
777832, 777655), Sen1Ch6 = c(0, 57, 0, 899145, 901398, 902644,
902723, 902737, 902436, 902095), Sen2Ch6 = c(0, 352, 0, 926157,
928263, 929423, 929746, 930043, 930042, 930025), Sen1Ch7 = c(0,
45, 0, 845802, 848332, 849736, 849960, 850137, 849979, 849764
), Sen2Ch7 = c(0, 360, 0, 867160, 869852, 871321, 871830, 872308,
872428, 872500), RefCh1 = c(0, 10100, 0, 908802, 911770, 913546,
914536, 915344, 915862, 916336), RefCh2 = c(0, 6200, 0, 940232,
942473, 943743, 944281, 944794, 945037, 945218), RefCh3 = c(0,
6200, 0, 947069, 948944, 950017, 950484, 950890, 951100, 951271
), RefCh4 = c(0, 14700, 0, 900977, 901433, 901543, 901167, 900974,
900630, 900271), RefCh5 = c(0, 8250, 0, 908355, 910304, 911295,
911674, 912045, 912133, 912179), RefCh6 = c(0, 6200, 0, 939365,
941703, 942978, 943500, 943980, 944147, 944314), RefCh7 = c(0,
6200, 0, 941728, 944713, 946375, 947078, 947774, 948077, 948325
)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"
))
You can feed dynamic variable names to mutate with !!sym for example:
for(i in 1:7){
out <- df %>%
mutate(!!sym(sprintf("ATN%s.1",i)) := ifelse(Status == 1, NA_integer_, -100 * log(!!sym(paste0("Sen1Ch",i))/!!sym(paste0("RefCh",i)))))
}
Note you need := inside the mutate.
Here is a base r solution with mapply. First define an auxiliary function f to make the code more readable, then get the column names to be changed and that take part in the formula with regular expressions, finally, csall the function f in a mapply loop.
f <- function(x, y, Status) {
ifelse(Status == 1, NA_integer_, -100 * log(x/y))
}
atn <- grep("^ATN\\d\\.1$", names(df), value = TRUE)
sen1ch <- grep("^Sen1Ch", names(df), value = TRUE)
refch <- grep("^RefCh", names(df), value = TRUE)
df[atn] <- mapply(f, df[sen1ch], df[refch], MoreArgs = list(Status = df$Status))
df
#> # A tibble: 10 x 36
#> Status ATN1.1 ATN2.1 ATN3.1 ATN4.1 ATN5.1 ATN6.1 ATN7.1 ATN0.1.1 ATN0.2.1
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 NA NA NA NA NA NA NA NA NA
#> 2 17 463. 476. 484. 548. 494. 469. 493. NA NA
#> 3 1 NA NA NA NA NA NA NA NA NA
#> 4 1 NA NA NA NA NA NA NA NA NA
#> 5 1 NA NA NA NA NA NA NA NA NA
#> 6 1 NA NA NA NA NA NA NA NA NA
#> 7 2 15.1 24.1 27.4 20.9 17.2 4.42 10.8 15.1 24.1
#> 8 0 15.2 24.3 27.5 21.0 17.2 4.47 10.9 15.1 24.1
#> 9 0 15.4 24.4 27.6 21.0 17.3 4.52 10.9 15.1 24.1
#> 10 0 15.6 24.5 27.8 21.1 17.4 4.57 11.0 15.1 24.1
#> # ... with 26 more variables: ATN0.3.1 <dbl>, ATN0.4.1 <dbl>, ATN0.5.1 <dbl>,
#> # ATN0.6.1 <dbl>, ATN0.7.1 <dbl>, Sen1Ch1 <dbl>, Sen2Ch1 <dbl>,
#> # Sen1Ch2 <dbl>, Sen2Ch2 <dbl>, Sen1Ch3 <dbl>, Sen2Ch3 <dbl>, Sen1Ch4 <dbl>,
#> # Sen2Ch4 <dbl>, Sen1Ch5 <dbl>, Sen2Ch5 <dbl>, Sen1Ch6 <dbl>, Sen2Ch6 <dbl>,
#> # Sen1Ch7 <dbl>, Sen2Ch7 <dbl>, RefCh1 <dbl>, RefCh2 <dbl>, RefCh3 <dbl>,
#> # RefCh4 <dbl>, RefCh5 <dbl>, RefCh6 <dbl>, RefCh7 <dbl>
Created on 2022-04-14 by the reprex package (v2.0.1)

transpose from one variable under another in R

here example of my data
mydat=structure(list(ADR.N.14.0 = c(8140010250001, 8140010250002),
NOMYAR.N.16.6 = c(1, 1), KOFPOR1.N.16.6 = c(7, 10), POR1.C.254 = c("о",
"BB"), VOZPOR1.N.16.6 = c(80, 45), VYSPOR1.N.16.6 = c(24,
17), DEMPOR1.N.16.6 = c(36, 16), POLNOT1.N.16.6 = c(0.6,
0.9), ZAPZAH1.N.16.6 = c(210, 160), NOMYAR2.N.16.6 = c(1,
1), KOFSOCT2.N.16.6 = c(3, 0), POR2.C.254 = c("BB", "о"),
VOZPOR2.N.16.6 = c(70, 45), VYSPOR2.N.16.6 = c(22, 17), DEMPOR2.N.16.6 = c(26,
22), POLNOT2.N.16.6 = c(0, 0), ZAPZAH2.N.16.6 = c(0, 0)), class = "data.frame", row.names = c(NA,
-2L))
how for each value of ADR,N,14,0move data from one variable under another.
To be more clear
here variables with prefix1
NOMYAR,N,16,6 KOFPOR**1**,N,16,6 POR**1**,C,254 VOZPOR**1**,N,16,6 VYSPOR**1**,N,16,6 DEMPOR**1**,N,16,6 POLNOT**1**,N,16,6 ZAPZAH**1**,N,16,6
and near rows with prefix2
NOMYAR**2**,N,16,6 KOFPOR**2**,N,16,6 POR**2**,C,254 VOZPOR**2**,N,16,6 VYSPOR**1**,N,16,6 DEMPOR**2**,N,16,6 POLNOT**2**,N,16,6 ZAPZAH**2**,N,16,6
so i need that for for ADR,N,14,0 =8140010250001
the content of the fields with the prefix 2 was under the content of the fields with the prefix 1
like this
result=structure(list(ADR.N.14.0 = c(8140010250001, 8140010250001, 8140010250002,
8140010250002, NA, NA, NA, NA, NA, NA), NOMYAR.N.16.6 = c(1,
1, 1, 1, NA, NA, NA, NA, NA, NA), KOFPOR1.N.16.6 = c(7, 3, 10,
0, NA, NA, NA, NA, NA, NA), POR1.C.254 = c("о", "BB", "BB", "о",
"", "", "", "", "", ""), VOZPOR1.N.16.6 = c(80, 70, 45, 45, NA,
NA, NA, NA, NA, NA), VYSPOR1.N.16.6 = c(24, 22, 17, 17, NA, NA,
NA, NA, NA, NA), DEMPOR1.N.16.6 = c(36, 26, 16, 22, NA, NA, NA,
NA, NA, NA), POLNOT1.N.16.6 = c(0.6, 0, 0.9, 0, NA, NA, NA, NA,
NA, NA), ZAPZAH1.N.16.6 = c(210, 0, 160, 0, NA, NA, NA, NA, NA,
NA)), class = "data.frame", row.names = c(NA, -10L))
How can i do such transpose?
You can use pivot_longer and specify names_pattern to include pattern of names that you want together.
tidyr::pivot_longer(mydat, cols = -ADR.N.14.0,
names_to = c('.value'),
names_pattern = '(.*?)\\d?\\..*')
# ADR.N.14.0 NOMYAR KOFPOR POR VOZPOR VYSPOR DEMPOR POLNOT ZAPZAH KOFSOCT
# <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 8140010250001 1 7 о 80 24 36 0.6 210 3
#2 8140010250001 1 NA BB 70 22 26 0 0 NA
#3 8140010250002 1 10 BB 45 17 16 0.9 160 0
#4 8140010250002 1 NA о 45 17 22 0 0 NA

R reshape name value pairs from wide to long using pivot_longer

I am trying to figure out how to reshape a dataset of the names of political parties from wide to long using dplyr and pivot_longer.
For each Party_ID, there is a number of constant columns attached (Party_Name_Short, Party_Name, Country, Party_in_orig_title) and a number of time changing factors as well: election, Date, Rename, Reason, Party_Title, alliance, member_parties, split, parent_party, merger, child_party, successor, predecessor. The time changing factors were recorded up to 11 times for each party, as reflected by the index in the colname.
In order to provide a sample I selected the first three time changing columns for each party and a sample of 5 random rows:
structure(list(Party_Name_Short = c("LZJ-PS", "ZiZi", "MNR",
"MDP", "E200"), Party_Name = c("Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200"), Country = c("SVN", "HRV", "FRA", "PRT",
"EST"), Party_ID = c(1987, 2612, 1263, 1281, 2720), Party_in_orig_title = c(0,
0, 0, 0, 0), Date1 = c(2011, NA, 1999, 1987, NA), Rename1 = c("Lista Zorana Jankovica – Pozitivna Slovenija",
NA, "Mouvement national républicain", "ID", NA), Reason1 = c("foundation",
NA, "split from FN", "split", NA), Party_Title1 = c(0, NA, 0,
0, NA), alliance1 = c(0, NA, 0, 0, NA), member_parties1 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_),
split1 = c(0, NA, 1, 1, NA), parent_party1 = c(NA, NA, "FN",
"MDP", NA), merger1 = c(0, NA, 0, 0, NA), child_party1 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), successor1 = c(0, NA, 0, 0, NA), predecessor1 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), Date2 = c(2012, NA, NA, NA, NA), Rename2 = c("Pozitivna Slovenija",
NA, NA, NA, NA), Reason2 = c("renamed", NA, NA, NA, NA),
Party_Title2 = c(0, NA, NA, NA, NA), alliance2 = c(0, NA,
NA, NA, NA), member_parties2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), split2 = c(0,
NA, NA, NA, NA), parent_party2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), merger2 = c(0,
NA, NA, NA, NA), child_party2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), successor2 = c(0,
NA, NA, NA, NA), predecessor2 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), Date3 = c(2014,
NA, NA, NA, NA), Rename3 = c("ZaAB", NA, NA, NA, NA), Reason3 = c("split",
NA, NA, NA, NA), Party_Title3 = c(0, NA, NA, NA, NA), alliance3 = c(0,
NA, NA, NA, NA), member_parties3 = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), split3 = c(1,
NA, NA, NA, NA), parent_party3 = c("LZJ-PS", NA, NA, NA,
NA), merger3 = c(0, NA, NA, NA, NA), child_party3 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), successor3 = c(0, NA, NA, NA, NA), predecessor3 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), election1 = structure(c(15309, 16740, 11839, 6390, 17956
), class = "Date"), election2 = structure(c(16252, NA, NA,
NA, NA), class = "Date"), election3 = structure(c(16344,
NA, NA, NA, NA), class = "Date")), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
I would like the data to follow a "long" structure where each party_id and the constant factors are repeated 11 times and there are single columns for the time changing factors. Following the top-rated answer formulated here I tried different variations of the following command:
pivot_longer(cols = starts_with(c("election", "Date", "Rename", "Reason", "Party_Title",
"alliance", "member_parties", "split", "parent_party",
"merger", "child_party", "successor", "predecessor")),
names_to = c(".value", "election", "Date", "Rename", "Reason", "Party_Title",
"alliance", "member_parties", "split", "parent_party",
"merger", "child_party", "successor", "predecessor"), names_sep = "_") %>%
select(-matches("election[1-9]"), -matches("Date[1-9]"), -matches("Rename[1-9]"),
-matches("Reason[1-9]"), -matches("alliance[1-9]"), -matches("member_parties[1-9]"),
-matches("split[1-9]"), -matches("parent_party[1-9]"), -matches("merger[1-9]"),
-matches("child_party[1-9]"), -matches("successor[1-9]"), -matches("predecessor[1-9]"),
-matches("Party_Title[1-9]"), -matches("election1[0-2]"), -matches("Date1[0-2]"), -matches("Rename1[0-2]"),
-matches("Reason1[0-2]"), -matches("alliance1[0-2]"), -matches("member_parties1[0-2]"),
-matches("split1[0-2]"), -matches("parent_party1[0-2]"), -matches("merger1[0-2]"),
-matches("child_party1[0-2]"), -matches("successor1[0-2]"), -matches("predecessor1[0-2]"),
-matches("Party_Title1[0-2]"))
However, for some reason, I get a lot of missing values and do not achieve the shape of the data I would like to have. I'd appreciate any hint if you have an idea of how to do this. Thanks!
Update:
I would like the final output to look something like:
structure(list(Party_Name_Short = c("LZJ-PS", "ZiZi", "MNR",
"MDP", "E200", "LZJ-PS", "ZiZi", "MNR", "MDP", "E200", "LZJ-PS",
"ZiZi", "MNR", "MDP", "E200"), Party_Name = c("Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200", "Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200", "Lista Zorana Jankovica – Pozitivna Slovenija",
"Živi zid", "Mouvement national républicain", "Movimento Democrático Português",
"Erakond Eesti 200"), Country = c("SVN", "HRV", "FRA", "PRT",
"EST", "SVN", "HRV", "FRA", "PRT", "EST", "SVN", "HRV", "FRA",
"PRT", "EST"), Party_ID = c(1987, 2612, 1263, 1281, 2720, 1987,
2612, 1263, 1281, 2720, 1987, 2612, 1263, 1281, 2720), Party_in_orig_title = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), time = c(1, 1, 1,
1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3), Date = c(2011, NA, 1999,
1987, NA, 2012, NA, NA, NA, NA, 2014, NA, NA, NA, NA), Rename = c("Lista Zorana Jankovica – Pozitivna Slovenija",
NA, "Mouvement national républicain", "ID", NA, "Pozitivna Slovenija",
NA, NA, NA, NA, "ZaAB", NA, NA, NA, NA), Reason = c("foundation",
NA, "split from FN", "split", NA, "renamed", NA, NA, NA, NA,
"split", NA, NA, NA, NA), Party_Title = c(0, NA, 0, 0, NA, 0,
NA, NA, NA, NA, 0, NA, NA, NA, NA), alliance = c(0, NA, 0, 0,
NA, 0, NA, NA, NA, NA, 0, NA, NA, NA, NA), member_parties = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), split = c(0,
NA, 1, 1, NA, 0, NA, NA, NA, NA, 1, NA, NA, NA, NA), parent_party = c(NA,
NA, "FN", "MDP", NA, NA, NA, NA, NA, NA, "LZJ-PS", NA, NA, NA,
NA), merger = c(0, NA, 0, 0, NA, 0, NA, NA, NA, NA, 0, NA, NA,
NA, NA), child_party = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), successor = c(0, NA, 0, 0, NA, 0, NA,
NA, NA, NA, 0, NA, NA, NA, NA), predecessor = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), election = structure(c(1322697600,
1446336000, 1022889600, 552096000, 1551398400, 1404172800, NA,
NA, NA, NA, 1412121600, NA, NA, NA, NA), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -15L), class = c("tbl_df",
"tbl", "data.frame"))
Notice: the newly added time column and notice that this is only for example purposes, with three time changing factors, whereas in fact there are 11 in the data.
Using pivot_longer with names_sep to split between a non-digit and a digit at the end of the string
library(tidyr)
library(dplyr)
df1 %>%
pivot_longer(cols = matches('\\d+$'), names_to = c(".value", 'time'),
names_sep="(?<=\\D)(?=\\d+$)") %>%
arrange(time)
# A tibble: 15 x 19
# Party_Name_Short Party_Name Country Party_ID Party_in_orig_t… time Date Rename Reason Party_Title alliance member_parties split
# <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl>
# 1 LZJ-PS Lista Zor… SVN 1987 0 1 2011 Lista… found… 0 0 <NA> 0
# 2 ZiZi Živi zid HRV 2612 0 1 NA <NA> <NA> NA NA <NA> NA
# 3 MNR Mouvement… FRA 1263 0 1 1999 Mouve… split… 0 0 <NA> 1
# 4 MDP Movimento… PRT 1281 0 1 1987 ID split 0 0 <NA> 1
# 5 E200 Erakond E… EST 2720 0 1 NA <NA> <NA> NA NA <NA> NA
# 6 LZJ-PS Lista Zor… SVN 1987 0 2 2012 Pozit… renam… 0 0 <NA> 0
# 7 ZiZi Živi zid HRV 2612 0 2 NA <NA> <NA> NA NA <NA> NA
# 8 MNR Mouvement… FRA 1263 0 2 NA <NA> <NA> NA NA <NA> NA
# 9 MDP Movimento… PRT 1281 0 2 NA <NA> <NA> NA NA <NA> NA
#10 E200 Erakond E… EST 2720 0 2 NA <NA> <NA> NA NA <NA> NA
#11 LZJ-PS Lista Zor… SVN 1987 0 3 2014 ZaAB split 0 0 <NA> 1
#12 ZiZi Živi zid HRV 2612 0 3 NA <NA> <NA> NA NA <NA> NA
#13 MNR Mouvement… FRA 1263 0 3 NA <NA> <NA> NA NA <NA> NA
#14 MDP Movimento… PRT 1281 0 3 NA <NA> <NA> NA NA <NA> NA
#15 E200 Erakond E… EST 2720 0 3 NA <NA> <NA> NA NA <NA> NA
# … with 6 more variables: parent_party <chr>, merger <dbl>, child_party <chr>, successor <dbl>, predecessor <chr>, election <date>

r - merge rows in group while replacing NAs [duplicate]

This question already has answers here:
Collapsing rows where some are all NA, others are disjoint with some NAs
(5 answers)
Closed 4 years ago.
I was trying to find the answer to this but couldn't. If there is an answer I apologize and will immediately delete my question.
I'm trying to merge several rows into one (this calculation should be done separately on groups, in this case variable id can be used to group), so that no NA values are left.
# initial dataframe
df_start <- data.frame(
id = c("as", "as", "as", "as", "as", "bs", "bs", "bs", "bs", "bs"),
b = c(NA, NA, NA, NA, "A", NA, NA, 6, NA, NA),
c = c(2, NA, NA, NA, NA, 7, NA, NA, NA, NA),
d = c(NA, 4, NA, NA, NA, NA, 8, NA, NA, NA),
e = c(NA, NA, NA, 3, NA, NA, NA, NA, "B", NA),
f = c(NA, NA, 5, NA, NA, NA, NA, NA, NA, 10))
# desired output
df_end <- data.frame(id = c("as", "bs"),
b = c("A", 6),
c = c(2, 7),
d = c(4, 8),
e = c(3,"B"),
f = c(5, 10))
No need to delete the question, it may be helpful to some users. This summarises each group to the first non NA occurrence for each column.
library(dplyr)
df_start <- data.frame(
id = c("as", "as", "as", "as", "as", "bs", "bs", "bs", "bs", "bs"),
b = c(NA, NA, NA, NA, "A", NA, NA, 6, NA, NA),
c = c(2, NA, NA, NA, NA, 7, NA, NA, NA, NA),
d = c(NA, 4, NA, NA, NA, NA, 8, NA, NA, NA),
e = c(NA, NA, NA, 3, NA, NA, NA, NA, "B", NA),
f = c(NA, NA, 5, NA, NA, NA, NA, NA, NA, 10))
df_start %>%
group_by(id) %>%
summarise_all(list(~first(na.omit(.))))
Output:
# A tibble: 2 x 6
id b c d e f
<fct> <fct> <dbl> <dbl> <fct> <dbl>
1 as A 2. 4. 3 5.
2 bs 6 7. 8. B 10.
You will, of course, get some data lost if there is more than one occurrence of a value with each group for each column.
Hope this helps, Using dplyr
df_start <- sapply(df_start, as.character)
df_start[is.na(df_start)] <- " "
df_start <- as.data.frame(df_start)
library(dplyr)
df_start %>%
group_by(id) %>%
summarise_all(funs(trimws(paste(., collapse = '')))) -> df

R quick way to write multiple lines of code with slight variations

I am working on a project in R which is fairly code heavy at least compared to my previous R projects. The code is using multiple ifelse statements on previous columns data then creating a new column with the results. As the data I am using is a 5 minute timeframe, therefore I have to write a new line of code for every 5 minute slice of time. The data I have is from 09:30 to 16:00 so that is a lot of lines of code, around 75 by my calculations. Example of my data;
Date Open High Low Close doy
1 2015-09-21 09:30:00 164.6700 164.7100 164.3700 164.5300 264
2 2015-09-21 09:35:00 164.5300 164.9000 164.5300 164.6400 264
3 2015-09-21 09:40:00 164.6600 164.8900 164.6000 164.8900 264
4 2015-09-21 09:45:00 164.9100 165.0900 164.9100 164.9736 264
5 2015-09-21 09:50:00 164.9399 165.0980 164.8200 164.8200 264
This data is then filtered onto a table like this;
data <- structure(list(doy = c(264, 265, 266, 267, 268, 271, 272, 11,12, 13), Date = structure(c(1442824200, 1442910600, 1442997000,1443083400, 1443169800, 1443429000, 1443515400, 1452504600, 1452591000,1452677400), class = c("POSIXct", "POSIXt"), tzone = ""), Or_High = c(164.71,162.96, 163.38, 161.37, 163.91, 162.06, 160.22, 164.5, 165.23,165.84), OR_Low = c(164.37, 162.62, 162.98, 161.06, 163.57, 161.66,159.7, 164.06, 164.84, 165.4), HOD = c(165.56, 163.36, 163.38,162.24, 164.43, 162.06, 160.96, 164.5, 165.78, 165.84), LOD = c(165.22,163.1, 162.98, 161.95, 164.24, 161.66, 160.75, 164.06, 165.56,165.4), Close = c(164.92, 163.02, 162.58, 161.85, 162.94, 159.84,160.19, 163.83, 165.02, 161.38), Range = c(0.340000000000003,0.260000000000019, 0.400000000000006, 0.29000000000002, 0.189999999999998,0.400000000000006, 0.210000000000008, 0.439999999999998, 0.219999999999999,0.439999999999998), `A-val` = c(NA, NA, NA, NA, NA, NA, NA, 0.0673439999999994,0.0659639999999996, 0.0729499999999996), `A-up` = c(NA, NA, NA,NA, NA, NA, NA, 164.567344, 165.295964, 165.91295), `A-down` = c(NA,NA, NA, NA, NA, NA, NA, 163.992656, 164.774036, 165.32705), `09:35` = structure(c(NA,NA, NA, NA, NA, NA, NA, 0, 0, 0), .Dim = c(10L, 1L), .Dimnames = list(NULL, "Low")), `09:40` = structure(c(NA, NA, NA, NA, NA,NA, NA, -1, 1, 0), .Dim = c(10L, 1L), .Dimnames = list(NULL,"Low")), `09:45` = structure(c(NA, NA, NA, NA, NA, NA, NA,0, 1, 0), .Dim = c(10L, 1L), .Dimnames = list(NULL, "Low")),`09:50` = structure(c(NA, NA, NA, NA, NA, NA, NA, -1, 1,0), .Dim = c(10L, 1L), .Dimnames = list(NULL, "Low")), `09:55` = structure(c(NA,NA, NA, NA, NA, NA, NA, -1, 0, 0), .Dim = c(10L, 1L), .Dimnames = list(NULL, "Low")), `10:00` = structure(c(NA, NA, NA, NA,NA, NA, NA, -1, 0, 0), .Dim = c(10L, 1L), .Dimnames = list(NULL, "Low")), `10:05` = structure(c(NA, NA, NA, NA,NA, NA, NA, -1, 0, 0), .Dim = c(10L, 1L), .Dimnames = list(NULL, "Low")), `10:10` = structure(c(NA, NA, NA, NA,NA, NA, NA, -1, 0, 0), .Dim = c(10L, 1L), .Dimnames = list(NULL, "Low")), `10:15` = structure(c(NA, NA, NA, NA,NA, NA, NA, -2, 0, -1), .Dim = c(10L, 1L), .Dimnames = list(NULL, "Low")), `10:20` = structure(c(NA, NA, NA, NA,NA, NA, NA, 0, 0, -1), .Dim = c(10L, 1L), .Dimnames = list(NULL, "Low")), `10:25` = structure(c(NA, NA, NA, NA,NA, NA, NA, -2, -1, -1), .Dim = c(10L, 1L), .Dimnames = list(NULL, "Low")), `10:30` = structure(c(NA, NA, NA, NA,NA, NA, NA, 0, 0, -1), .Dim = c(10L, 1L), .Dimnames = list(NULL, "Low")), `10:35` = structure(c(NA, NA, NA, NA,NA, NA, NA, 0, 0, -1), .Dim = c(10L, 1L), .Dimnames = list(NULL, "Low")), `10:40` = structure(c(NA, NA, NA, NA,NA, NA, NA, 0, -1, -2), .Dim = c(10L, 1L), .Dimnames = list(NULL, "Low")), `10:45` = structure(c(NA, NA, NA, NA,NA, NA, NA, 0, -1, 0), .Dim = c(10L, 1L), .Dimnames = list(NULL, "Low")), `10:50` = structure(c(NA, NA, NA, NA,NA, NA, NA, -1, -1, -2), .Dim = c(10L, 1L), .Dimnames = list(NULL, "Low")), `10:55` = structure(c(NA, NA, NA, NA,NA, NA, NA, -1, -1, 0), .Dim = c(10L, 1L), .Dimnames = list(NULL, "Low"))), .Names = c("doy", "Date", "Or_High","OR_Low", "HOD", "LOD", "Close", "Range", "A-val", "A-up", "A-down","09:35", "09:40", "09:45", "09:50", "09:55", "10:00", "10:05","10:10", "10:15", "10:20", "10:25", "10:30", "10:35", "10:40","10:45", "10:50", "10:55"), row.names = c(1L, 2L, 3L, 4L, 5L,6L, 7L, 78L, 79L, 80L), class = "data.frame")
This is what the lines of code looks like;
data[,14] <- ifelse(df %>% filter(hour(Date) == 09 & minute(Date) == 45) %>% select(Low) > data[,10], 1, ifelse(df %>% filter(hour(Date) == 09 & minute(Date) == 45) %>% select(High) < data[,11], -1, 0))
Then the next line of code would look like;
data[,15] <- ifelse(df %>% filter(hour(Date) == 09 & minute(Date) == 50) %>% select(Low) > data[,10], 1, ifelse(df %>% filter(hour(Date) == 09 & minute(Date) == 50) %>% select(High) < data[,11], -1, 0))
And the next like this etc;
data[,16] <- ifelse(df %>% filter(hour(Date) == 09 & minute(Date) == 55) %>% select(Low) > data[,10], 1, ifelse(df %>% filter(hour(Date) == 09 & minute(Date) == 55) %>% select(High) < data[,11], -1, 0))
As you can see with each new line of code only certain parts of the code are changed, such as the hours, minutes and column references for summing. Perhaps the below example will make it clearer.
Example;
colnames(data)[14] <- "09:45"
colnames(data)[15] <- "09:50"
colnames(data)[16] <- "09:55"
colnames(data)[17] <- "10:00"
colnames(data)[18] <- "10:05"
In this code would there be anyway to change the [#col ref#] and times without individually changing each line of code by hand? I realise that copy and paste can be used with notepad but that still means having write the individual changes. My main concern is not about the time taken to write this but moreover the risk of errors from human input.
If anyone has any tips or tricks as to how this can be done, or another way of achieving the same without using multiple if statements on the structure of my existing code I would be most grateful for your help. This question is related to previous question I posted here and may add clarity for what I am trying to achieve.
Thanks.
As vanao veneri mentioned it is better to use a text editor for writing bulk code quickly.
I found that Sublime 3 with Text Pastry add-on did exactly what I needed using the insert nuns command.
Thanks for the help.

Resources